# NER (Named Entity Recognition)


# 1 Initial Project Setup (Data Ingestion)

In [None]:
# Uncomment and run this cell if you're on Colab or Kaggle
"""
By pretraining on huge corpora across many languages, 
these multilingual transformers enable zero-shot cross-lingual transfer. 
This means that a model that is fine-tuned on one language can be applied to others without any further training!

"""
!git clone https://github.com/nlp-with-transformers/notebooks.git



Cloning into 'notebooks'...
remote: Enumerating objects: 422, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 422 (delta 0), reused 5 (delta 0), pack-reused 416[K
Receiving objects: 100% (422/422), 24.97 MiB | 24.35 MiB/s, done.
Resolving deltas: 100% (190/190), done.
⏳ Installing base requirements ...
✅ Base requirements installed!
⏳ Installing Git LFS ...
✅ Git LFS installed!


In [2]:
import os
os.chdir("/content/drive/MyDrive/FDFS_feb_Batch/NLP/NER task/notebooks")


from install import *
install_requirements()

⏳ Installing base requirements ...
✅ Base requirements installed!
⏳ Installing Git LFS ...
✅ Git LFS installed!


# Device Setup Pytorch

In [3]:
from utils import *
setup_chapter()


import pandas as pd
import numpy as np 
import torch 

print(torch.__version__)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

No GPU was detected! This notebook can be *very* slow without a GPU 🐢
Go to Runtime > Change runtime type and select a GPU hardware accelerator.
Using transformers v4.11.3
Using datasets v1.16.1
1.12.0+cu113
cpu


# DownLoading Benchmark Dataset 

In [4]:
# Data ingestion Phase

from datasets import get_dataset_config_names
from datasets import load_dataset
from datasets import load_from_disk



In [None]:
xtreme_subsets = get_dataset_config_names("xtreme")
print(f"XTREME has {len(xtreme_subsets)} configurations")

panx_subsets = [s for s in xtreme_subsets if s.startswith("PAN")]

In [None]:
print("All PanX languages \n")
print(panx_subsets)

en = load_dataset("xtreme", name='PAN-X.en')

print("English Dataset \n")
print(en)

All PanX languages

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg', 'PAN-X.bn', 'PAN-X.de', 'PAN-X.el',
'PAN-X.en', 'PAN-X.es', 'PAN-X.et', 'PAN-X.eu', 'PAN-X.fa', 'PAN-X.fi',
'PAN-X.fr', 'PAN-X.he', 'PAN-X.hi', 'PAN-X.hu', 'PAN-X.id', 'PAN-X.it',
'PAN-X.ja', 'PAN-X.jv', 'PAN-X.ka', 'PAN-X.kk', 'PAN-X.ko', 'PAN-X.ml',
'PAN-X.mr', 'PAN-X.ms', 'PAN-X.my', 'PAN-X.nl', 'PAN-X.pt', 'PAN-X.ru',
'PAN-X.sw', 'PAN-X.ta', 'PAN-X.te', 'PAN-X.th', 'PAN-X.tl', 'PAN-X.tr',
'PAN-X.ur', 'PAN-X.vi', 'PAN-X.yo', 'PAN-X.zh']
Downloading and preparing dataset xtreme/PAN-X.en (download: 223.17 MiB, generated: 7.30 MiB, post-processed: Unknown size, total: 230.47 MiB) to /root/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/2fc6b63c5326cc0d1f73060649612889b3a7ed8a6605c91cecdbd228a7158b17...


Downloading:   0%|          | 0.00/234M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset xtreme downloaded and prepared to /root/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/2fc6b63c5326cc0d1f73060649612889b3a7ed8a6605c91cecdbd228a7158b17. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

English Dataset

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
})


# Data Exploration /  Configuration File 

In [5]:
%cd /content/drive/MyDrive/FDFS_feb_Batch/NLP/NER task

/content/drive/MyDrive/FDFS_feb_Batch/NLP/NER task


In [6]:
en = load_from_disk("./artifacts/data_store")

In [7]:
en

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
})

# Data Exploration / Configuration

### Train Data Exploration

In [None]:
len(en["train"])

20000

In [None]:
# print only the first data 
for i in en["train"]:
  print(pd.DataFrame(i))
  break


      tokens  ner_tags langs
0       R.H.         3    en
1   Saunders         4    en
2          (         0    en
3        St.         3    en
4   Lawrence         4    en
5      River         4    en
6          )         0    en
7          (         0    en
8        968         0    en
9         MW         0    en
10         )         0    en


In [None]:
pd.DataFrame(en["train"][0]).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
tokens,R.H.,Saunders,(,St.,Lawrence,River,),(,968,MW,)
ner_tags,3,4,0,3,4,4,0,0,0,0,0
langs,en,en,en,en,en,en,en,en,en,en,en


In [None]:
pd.DataFrame(en["train"][100]).transpose()

Unnamed: 0,0,1,2,3,4
tokens,List,of,years,in,Brazil
ner_tags,3,4,4,4,4
langs,en,en,en,en,en


In [None]:
df = pd.DataFrame(en["train"])
df

Unnamed: 0,tokens,ner_tags,langs
0,"[R.H., Saunders, (, St., Lawrence, River, ), (...","[3, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0]","[en, en, en, en, en, en, en, en, en, en, en]"
1,"[;, ', '', Anders, Lindström, '', ']","[0, 0, 0, 1, 2, 0, 0]","[en, en, en, en, en, en, en]"
2,"[Karl, Ove, Knausgård, (, born, 1968, )]","[1, 2, 2, 0, 0, 0, 0]","[en, en, en, en, en, en, en]"
3,"[Atlantic, City, ,, New, Jersey]","[5, 6, 6, 6, 6]","[en, en, en, en, en]"
4,"[Her, daughter, from, the, second, marriage, w...","[0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, ...","[en, en, en, en, en, en, en, en, en, en, en, e..."
...,...,...,...
19995,"[Cicely, Courtneidge, ,, Ernest, Truex]","[1, 2, 0, 1, 2]","[en, en, en, en, en]"
19996,"[Aracaju, ,, Sergipe, ,, Brazil]","[5, 0, 5, 0, 5]","[en, en, en, en, en]"
19997,"[Louisville, in, the, American, Civil, War]","[5, 6, 6, 6, 6, 6]","[en, en, en, en, en, en]"
19998,"[16, (, David, Nugent, )]","[0, 0, 1, 2, 0]","[en, en, en, en, en]"


### NER Tags Exploration

We have 7 classes 
- 0 : 'O' --> other
- 1 : 'B-PER' --> (Entity is begining with person. Ex: Sentence is starting with person)
- 2 : 'I-PER' --> (Entity have a person in between. Ex: In a sentence a person is somewhere in the middle)
- 3 : 'B-ORG' --> (Entity is begining with name of organisation)
- 4 : 'I-ORG' --> (Entity have organisation somewhere in the middle)
- 5 : 'B-LOC' --> (Entity is begining with name of Location)
- 6 : 'I-LOC' --> (Entity have Location somewhere int he middle)

In [None]:
" ".join(en["train"][1]["tokens"])


"; ' '' Anders Lindström '' '"

In [None]:
len(" ".join(en["train"][1]["tokens"]))

28

In [None]:
en["train"][1]["ner_tags"]

[0, 0, 0, 1, 2, 0, 0]

In [None]:
en["train"]

Dataset({
    features: ['tokens', 'ner_tags', 'langs'],
    num_rows: 20000
})

In [None]:
en["train"].features

{'langs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], names_file=None, id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [None]:
en["train"].features["ner_tags"]

Sequence(feature=ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], names_file=None, id=None), length=-1, id=None)

In [None]:
en["train"].features["ner_tags"].feature

ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], names_file=None, id=None)

In [None]:
en["train"].features["ner_tags"].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

# Data Validation

In [None]:
en_dict = dict(en)

In [None]:
type(en_dict)

dict

### column length check

In [None]:

col_names = ["tokens","ner_tags","langs"]
splits = ["train","test","validation"]
result = list()
for split in splits:
  result.append(
      sum(pd.DataFrame(en_dict[split]).columns == col_names) )
  
result


[3, 3, 3]

In [None]:
checks_results = list()

In [None]:
if sum(result) == len(col_names) * len(splits):
  checks_results.append(True)
else:
  checks_results.append(True)

In [None]:
checks_results

[True]

### column type check

In [None]:
en_dict["train"]

Dataset({
    features: ['tokens', 'ner_tags', 'langs'],
    num_rows: 20000
})

In [None]:
en_dict["train"].features

{'langs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], names_file=None, id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [None]:
en_dict["train"].features["langs"].feature.dtype

'string'

In [None]:
en_dict["train"].features["tokens"].feature.dtype

'string'

In [None]:
en_dict["train"].features["ner_tags"].feature.dtype

'int64'

In [None]:
splits = ["train","test","validation"]
col_names = ["tokens","langs","ner_tags"]
types = ["string","int64"]
result = list()
for split in splits:
  count = 0
  for col_name in col_names:
    if(en_dict[split].features[col_name].feature.dtype in types):
      count+=1
  result.append(count)
  print("/n")

/n
/n
/n


In [None]:
result

[3, 3, 3]

### Null Value Check

In [None]:
pd.DataFrame(en["train"]).isnull().values.any()

False

In [None]:
pd.DataFrame(en["test"]).isnull().values.any()

False

In [None]:
pd.DataFrame(en["validation"]).isnull().values.any()

False

In [None]:
lst = [False,False,False]
if sum(lst) == 0:
  print(True)
else:
  print(False)

True


# Data Preprocessing

### Tags Object

In [None]:
tags = en["train"].features["ner_tags"].feature
print(tags)

ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG',
'B-LOC', 'I-LOC'], names_file=None, id=None)


In [None]:
tags.int2str(0)

'O'

In [None]:
tags.str2int('B-PER')

1

### NER-tags to Token-tags

In [None]:
tags = en["train"].features["ner_tags"].feature
print(tags)

ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG',
'B-LOC', 'I-LOC'], names_file=None, id=None)


We have 7 classes

- 0 : 'O'
- 1 : 'B-PER' --> (Entity is begining with person. Ex: Sentence is starting with person)
- 2 : 'I-PER' --> (Entity have a person in between. Ex: In a sentence a peson is somewhere in the middle)
- 3 : 'B-ORG' --> (Entity is begining with name of organisation)
- 4 : 'I-ORG' --> (Entity have organisation somewhere int he middle)
- 5 : 'B-LOC' --> (Entity is begining with name of Location)
- 6 : 'I-LOC' --> (Entity have Location somewhere int he middle)

In [None]:
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}

In [None]:
# Tags to tokens

# Train, Validation and Test
# Train features -> tokens, ner_tags, lang
# select ner_tags -> apply int2str() as below


tags = en["train"].features["ner_tags"].feature
print(tags)

def create_tag_name(batch):
  return {"ner_tags_str":[tags.int2str(idx) for idx in batch["ner_tags"]]}

# mapping this to all train, test and validation data
new_en = en.map(create_tag_name)

ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG',
'B-LOC', 'I-LOC'], names_file=None, id=None)


  0%|          | 0/10000 [00:00<?, ?ex/s]

  0%|          | 0/10000 [00:00<?, ?ex/s]

  0%|          | 0/20000 [00:00<?, ?ex/s]

In [None]:
pd.DataFrame(new_en["train"])

Unnamed: 0,tokens,ner_tags,langs,ner_tags_str
0,"[R.H., Saunders, (, St., Lawrence, River, ), (...","[3, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0]","[en, en, en, en, en, en, en, en, en, en, en]","[B-ORG, I-ORG, O, B-ORG, I-ORG, I-ORG, O, O, O..."
1,"[;, ', '', Anders, Lindström, '', ']","[0, 0, 0, 1, 2, 0, 0]","[en, en, en, en, en, en, en]","[O, O, O, B-PER, I-PER, O, O]"
2,"[Karl, Ove, Knausgård, (, born, 1968, )]","[1, 2, 2, 0, 0, 0, 0]","[en, en, en, en, en, en, en]","[B-PER, I-PER, I-PER, O, O, O, O]"
3,"[Atlantic, City, ,, New, Jersey]","[5, 6, 6, 6, 6]","[en, en, en, en, en]","[B-LOC, I-LOC, I-LOC, I-LOC, I-LOC]"
4,"[Her, daughter, from, the, second, marriage, w...","[0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, ...","[en, en, en, en, en, en, en, en, en, en, en, e...","[O, O, O, O, O, O, O, B-PER, I-PER, O, O, O, O..."
...,...,...,...,...
19995,"[Cicely, Courtneidge, ,, Ernest, Truex]","[1, 2, 0, 1, 2]","[en, en, en, en, en]","[B-PER, I-PER, O, B-PER, I-PER]"
19996,"[Aracaju, ,, Sergipe, ,, Brazil]","[5, 0, 5, 0, 5]","[en, en, en, en, en]","[B-LOC, O, B-LOC, O, B-LOC]"
19997,"[Louisville, in, the, American, Civil, War]","[5, 6, 6, 6, 6, 6]","[en, en, en, en, en, en]","[B-LOC, I-LOC, I-LOC, I-LOC, I-LOC, I-LOC]"
19998,"[16, (, David, Nugent, )]","[0, 0, 1, 2, 0]","[en, en, en, en, en]","[O, O, B-PER, I-PER, O]"


In [None]:
pd.DataFrame(new_en["train"][100])

Unnamed: 0,tokens,ner_tags,langs,ner_tags_str
0,List,3,en,B-ORG
1,of,4,en,I-ORG
2,years,4,en,I-ORG
3,in,4,en,I-ORG
4,Brazil,4,en,I-ORG


In [None]:
pd.DataFrame(new_en["train"][100]).transpose()

Unnamed: 0,0,1,2,3,4
tokens,List,of,years,in,Brazil
ner_tags,3,4,4,4,4
langs,en,en,en,en,en
ner_tags_str,B-ORG,I-ORG,I-ORG,I-ORG,I-ORG


In [None]:
new_en

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 20000
    })
})

In [None]:
de_example = new_en["train"][100]
pd.DataFrame([de_example["tokens"],de_example["ner_tags"] ,de_example["ner_tags_str"], de_example["langs"]],
['Tokens',"ner_tags" ,'ner_tags_str','language'])


Unnamed: 0,0,1,2,3,4
Tokens,List,of,years,in,Brazil
ner_tags,3,4,4,4,4
ner_tags_str,B-ORG,I-ORG,I-ORG,I-ORG,I-ORG
language,en,en,en,en,en


### Xlmr-Tokenizer
- XLM-R stands for(XLM)cross language modelling and (R)Roberta is special model for cross entity language modelling
- XLM-R -> have vocab size 250,000 words
- Instead of using a WordPiece tokenizer, XLM-R uses a sentence tokenizer called SentencePiece.
- this tokenizer preserve White spaces using _ .
- Vocab - After tokenization replace with index position



In [8]:
# downloading tokenizer 
def get_model_and_tokenizer():
  from transformers import AutoTokenizer 
  if not os.path.isdir(path):
    os.mkdir(path)
    xlmr_model_name = "xlm-roberta-base"
    xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)
    xlmr_tokenizer.save_pretrained(path)
    return xlmr_model_name, xlmr_tokenizer
  else:
    xlmr_model_name = "xlm-roberta-base"
    xlmr_tokenizer = AutoTokenizer.from_pretrained(path)
    return xlmr_model_name, xlmr_tokenizer

In [10]:
xlmr_model_name, xlmr_tokenizer = get_model_and_tokenizer()

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

In [None]:
# Getting tokeninzed output of the sentence

xlmr_tokenizer("hello my name is akash and I working as assistant professor")

{'input_ids': [0, 33600, 31, 759, 9351, 83, 10, 90595, 136, 87, 20697, 237, 195644, 16030, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
xlmr_tokens = xlmr_tokenizer('hello my name is akash and I working as assistant professor').tokens()
xlmr_tokens 

['<s>',
 '▁hell',
 'o',
 '▁my',
 '▁name',
 '▁is',
 '▁a',
 'kash',
 '▁and',
 '▁I',
 '▁working',
 '▁as',
 '▁assistant',
 '▁professor',
 '</s>']

# Tokenizing Texts for NER

Lets pick up some random datapoint from train data and tokenize it using XLMR tokenizer.

In [None]:
de_example = new_en["train"][8]

In [None]:
# In this example we can see that the tokenizer has split, 
# Since we’re following the convention that only first word
# should be associated with the B-LOC label, we need a way to mask the 
# subword representations after the first subword. Fortunately, 
# tokenized_input is a class that contains a word_ids() function that can help us achieve this:

words, labels = de_example["tokens"], de_example["ner_tags"]
words, labels

(['*Inducted',
  'into',
  'the',
  'United',
  'States',
  'Hockey',
  'Hall',
  'of',
  'Fame',
  'in',
  '2015'],
 [0, 0, 0, 3, 4, 4, 4, 4, 4, 0, 0])

In [11]:
# create integer tokens and attention mask from the sentence
tokenized_input = xlmr_tokenizer(de_example["tokens"], is_split_into_words=True)
tokenized_input

NameError: ignored

In [None]:
# getting word tokens from the integer tokens
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['<s>',
 '▁*',
 'In',
 'duct',
 'ed',
 '▁into',
 '▁the',
 '▁United',
 '▁States',
 '▁Hockey',
 '▁Hall',
 '▁of',
 '▁Fam',
 'e',
 '▁in',
 '▁2015',
 '</s>']

In [None]:
# for each word token we will provide it with the word id.
word_ids = tokenized_input.word_ids()
word_ids

[None, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, None]

In [None]:
print(f"length of actual sentence: {len(words)-2}")
print(f"length of ner_tags/labels : {len(labels)-2}")
print(f"length of tokenized words : {len(tokens)-2}")

length of actual sentence: 9
length of ner_tags/labels : 9
length of tokenized words : 15


- we can observe that we have got more no. of tokenized words then the actual words as XLMR tokenizer is internally performing some stemming operation

  - notice word "Inducted" is tokenized as "_*", "In", "duct", "ed" and this is 1 complete word without and underscore symbol(_) so this we got 4 zeros, what happens internally is the loop looks for is underscores to understand it is next word and then gives wordid to the next word. so the wordids for inducted is [0,0,0,0]


  - notice word "Fame" is tokenized as "_Fam", "e" so both have got wordid [8,8] 

- Other thing that we can observe here is, instead of spaces XLMR uses "_" to denote spaces

Now the problem here is we have actual sentence length of 9 --> labels are also 9(excluding start and end tags) but the no. of tokenized words are 15.

In [None]:
# Actual label of the sentence
[index2tag[idx] for idx in de_example["ner_tags"]][1:-1]

['O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O']

In [None]:
previous_word_idx = None
label_ids = []

for word_idx in word_ids:
    if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(-100)
    elif word_idx != previous_word_idx:
        label_ids.append(labels[word_idx])
    previous_word_idx = word_idx

labels = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["Tokens", "Word IDs", "Label IDs", "Labels"]

pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Tokens,<s>,▁*,In,duct,ed,▁into,▁the,▁United,▁States,▁Hockey,▁Hall,▁of,▁Fam,e,▁in,▁2015,</s>
Word IDs,,0,0,0,0,1,2,3,4,5,6,7,8,8,9,10,
Label IDs,-100,0,-100,-100,-100,0,0,3,4,4,4,4,4,-100,0,0,-100
Labels,IGN,O,IGN,IGN,IGN,O,O,B-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,IGN,O,O,IGN


In [None]:
de_example = new_en["train"][8]
pd.DataFrame([de_example["tokens"],de_example["ner_tags"] ,de_example["ner_tags_str"], de_example["langs"]],
['Tokens',"ner_tags" ,'ner_tags_str','language'])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
Tokens,*Inducted,into,the,United,States,Hockey,Hall,of,Fame,in,2015
ner_tags,0,0,0,3,4,4,4,4,4,0,0
ner_tags_str,O,O,O,B-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,O,O
language,en,en,en,en,en,en,en,en,en,en,en


So now what we need to do?

- for model training we do not need "Tokens", "language" fields as these are only strings so we will remove them.

- We will also remove "ner_tags", instead of this we will introduce field "Labels" which will also have -100 value for each tokenized word.

- For field "ner_tags_str" we will not do anything, there is no need to append "IGN(Ignore)" tag 

- we will add "attention mask", "input_ids" along with "Labels", "ner_tags_str" fields





In [None]:
for idx, label in enumerate(de_example["ner_tags"]):
  print(idx)
  print(label)


0
0
1
0
2
0
3
3
4
4
5
4
6
4
7
4
8
4
9
0
10
0


In [None]:
# apply this to whole dataset
def tokenize_and_align_labels(examples):
    tokenized_inputs = xlmr_tokenizer(examples["tokens"], truncation=True,
                                      is_split_into_words=True)
    labels = []
    for idx, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
def encode_panx_dataset(corpus):
    return corpus.map(tokenize_and_align_labels, batched=True,
                      remove_columns=['langs', 'ner_tags', 'tokens'])

In [None]:
panx_en_encoded = encode_panx_dataset(new_en) 

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

In [None]:
panx_en_encoded

DatasetDict({
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'ner_tags_str'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'ner_tags_str'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'ner_tags_str'],
        num_rows: 20000
    })
})

previous Features:

- <b>['tokens', 'ner_tags', 'langs', 'ner_tags_str']</b>

New features: 

- <b>['attention_mask', 'input_ids', 'labels', 'ner_tags_str']</b>

/content/drive/MyDrive/FDFS_feb_Batch/NLP/NER task


In [None]:
# saving data to disk
panx_en_encoded.save_to_disk("./artifacts/serialized_objects")


# Custom Model Building with Pytorch and hugging face
## Model Architecture file

In [12]:
# reloading the data
panx_en_encoded = load_from_disk("./artifacts/serialized_objects")
panx_en_encoded

DatasetDict({
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'ner_tags_str'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'ner_tags_str'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'ner_tags_str'],
        num_rows: 20000
    })
})

![nlpt_0404](https://user-images.githubusercontent.com/40850370/176989445-bbf9d48d-3244-4176-b7e0-9440a4ff1b37.png)


In [13]:
# we will use RoBERTa as the base model but augmented with settings specific to XLM-R. 
# The config_class ensures that the standard XLM-R settings are used when we initialize a new model.
# Note that we set add_​pool⁠ing_layer=False to ensure all hidden states are returned and not only the one associated with the [CLS] token.
# Finally, we initialize all the weights by calling the init_weights()


import torch.nn as nn
from transformers import XLMRobertaConfig # this class will get every model configuration settings of roberta model
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
  config_class = XLMRobertaConfig

  # notice :
  # XLMRobertaConfig is the configuration on which the roboerta is pretrained
  # config is the configuration we modify using Autoconfig for fine tuning the model

  def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels

    #Load model body
    self.roberta = RobertaModel(config, add_pooling_layer=False)

    # setup token classification head
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier=nn.Linear(config.hidden_size, config.num_labels)

    # load and initialize weights/ pretrained of roberta model
    # init_weights() belongs to RobertaPreTrainedModel class which we are inheriting in the __init__ constructor
    self.init_weights()

  def forward(self, input_ids=None, attention_mask=None, token_type_ids = None, labels=None, **kwargs):
    # use model body to get encoder representations
    outputs = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, **kwargs)
    
    # Apply classifire to encoder representation
    sequence_output = self.dropout(outputs[0])
    logits = self.classifier(sequence_output)
    
    # calculate losses
    loss = None

    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    # Return model output object
    return TokenClassifierOutput(loss=loss,logits=logits, 
                                 hidden_states=outputs.hidden_states,
                                 attentions=outputs.attentions)



## Auto Configuration

In [14]:
en = load_from_disk("./artifacts/data_store")
tags = en["train"].features["ner_tags"].feature
print(tags)

ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG',
'B-LOC', 'I-LOC'], names_file=None, id=None)


In [15]:
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}

In [16]:
index2tag

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC'}

In [17]:
tag2index

{'B-LOC': 5,
 'B-ORG': 3,
 'B-PER': 1,
 'I-LOC': 6,
 'I-ORG': 4,
 'I-PER': 2,
 'O': 0}

- The AutoCofig class contains the blueprint of a model's architecture
- Some addition information beyond the model name, including the tags that we will use to label each entity and the mapping of each tag to an ID and vice versa.

In [18]:
xlmr_model_name, xlmr_tokenizer = get_model_and_tokenizer()

In [19]:
from transformers import AutoConfig

xlmr_config = AutoConfig.from_pretrained(xlmr_model_name,
                                         num_labels= tags.num_classes,
                                         id2label= index2tag, label2id=tag2index)


In [20]:
xlmr_model_name

'xlm-roberta-base'

In [21]:
xlmr_config

XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

To summarize AutoConfig:
1. For fine-tuning we need to provide the model name, no. of classes, and many other configurations on which we want to tune our custom data.
2. All these configuration information we provide in AutoConfig.

## Quick Prediction 
Lets try to predict <b>without fine tuning</b>

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


xlmr_model = (XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name, config=xlmr_config).to(device))

cuda


Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

#### EXAMPLE 1

In [None]:
datapoint = panx_en_encoded["train"]["input_ids"][100]
actual_tags = panx_en_encoded["train"]["ner_tags_str"][100]
print(datapoint)
tokenized_form = xlmr_tokenizer.convert_ids_to_tokens(datapoint)
actual_form = xlmr_tokenizer.convert_tokens_to_string(tokenized_form)
print(f"Tokenized form \n {tokenized_form}")
print(f"Actual sentence \n {actual_form}")
print(f"ACTUAL TAGS \n {actual_tags}")

[0, 32036, 111, 5369, 23, 30089, 2]
Tokenized form
 ['<s>', '▁List', '▁of', '▁years', '▁in', '▁Brazil', '</s>']
Actual sentence
 <s> List of years in Brazil</s>
ACTUAL TAGS
 ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG']


In [None]:
# convert input_ids into tokens
data = torch.tensor(datapoint)
print(data)
data = data.reshape(1,-1)
print(data)

# applying predictions
# prediction without using fine tuned model
outputs = xlmr_model(data.to(device)).logits
predictions = torch.argmax(outputs, dim=-1)
print(f"\nNumber of tokens in sequence: {len(data[0])}")
print(f"Shape of outputs: {outputs.shape}")

print(outputs)

print("\n\nPREDICTED TAGS")
pred_tags = [index2tag[i.item()] for i in predictions[0][1:-1]]
print(pred_tags)

print(f"ACTUAL TAGS \n {actual_tags}")


tensor([    0, 32036,   111,  5369,    23, 30089,     2])
tensor([[    0, 32036,   111,  5369,    23, 30089,     2]])

Number of tokens in sequence: 7
Shape of outputs: torch.Size([1, 7, 7])
tensor([[[ 0.5796,  0.6435, -0.3025,  0.3407,  0.4078,  0.3925,  0.1479],
         [ 0.6084,  0.6773, -0.5249,  0.0392,  0.6703,  0.4355,  0.2825],
         [ 0.5010,  0.7136, -0.4612,  0.1292,  0.5524,  0.3704,  0.2346],
         [ 0.6157,  0.6970, -0.4364,  0.0424,  0.6584,  0.4871,  0.3452],
         [ 0.4934,  0.7942, -0.4836,  0.1831,  0.5436,  0.3443,  0.2360],
         [ 0.5380,  0.7606, -0.4919,  0.0178,  0.6239,  0.4657,  0.1835],
         [ 0.5865,  0.6369, -0.3402,  0.3266,  0.3989,  0.4070,  0.1525]]],
       device='cuda:0', grad_fn=<ViewBackward0>)


PREDICTED TAGS
['B-PER', 'B-PER', 'B-PER', 'B-PER', 'B-PER']
ACTUAL TAGS
 ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG']


Observation:
1. the PREDICTED and ACTUAL TAGS are completely mismatched, obviously because model is not yet fine tuned

2. The output we have got have size [1,7,7], which means ⤵
  - 1 -> means we have 1 data point
  - 7 -> is the no. of tokens in data point
  - 7 -> for each single token we have 7 probabilities values for seven classes.


#### EXAMPLE 2

In [None]:
datapoint = panx_en_encoded["train"]["input_ids"][15]
actual_tags = panx_en_encoded["train"]["ner_tags_str"][15]
print(datapoint)
tokenized_form = xlmr_tokenizer.convert_ids_to_tokens(datapoint)
actual_form = xlmr_tokenizer.convert_tokens_to_string(tokenized_form)

print(f"Tokenized form \n {tokenized_form}")
print(f"Length of Tokenized form -> {len(tokenized_form)} \n")
print(f"Actual sentence \n {actual_form}")
print(f"Length of actual sentence -> {len(actual_form.split())+1} \n")
print(f"ACTUAL TAGS \n {actual_tags}")

[0, 54041, 24748, 36216, 6, 4, 51978, 111, 166207, 3956, 136, 147202, 46542, 2]
Tokenized form
 ['<s>', '▁Prince', '▁Albert', '▁Victor', '▁', ',', '▁Duke', '▁of', '▁Clare',
'nce', '▁and', '▁Avon', 'dale', '</s>']
Length of Tokenized form -> 14

Actual sentence
 <s> Prince Albert Victor , Duke of Clarence and Avondale</s>
Length of actual sentence -> 11

ACTUAL TAGS
 ['B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER',
'I-PER']


In [None]:
# convert input_ids into tokens
data = torch.tensor(datapoint)
print(data)
data = data.reshape(1,-1)
print(data)

# applying predictions
# prediction without using fine tuned model
outputs = xlmr_model(data.to(device)).logits
predictions = torch.argmax(outputs, dim=-1)
print(f"\nNumber of tokens in sequence: {len(data[0])}")
print(f"Shape of outputs: {outputs.shape}")

print(outputs)

print("\n\nPREDICTED TAGS")
pred_tags = [index2tag[i.item()] for i in predictions[0][1:-1]]
print(pred_tags)

print(f"ACTUAL TAGS \n {actual_tags}")


tensor([     0,  54041,  24748,  36216,      6,      4,  51978,    111, 166207,
          3956,    136, 147202,  46542,      2])
tensor([[     0,  54041,  24748,  36216,      6,      4,  51978,    111, 166207,
           3956,    136, 147202,  46542,      2]])

Number of tokens in sequence: 14
Shape of outputs: torch.Size([1, 14, 7])
tensor([[[ 0.6060,  0.6641, -0.2747,  0.3874,  0.3721,  0.3519,  0.1463],
         [ 0.6160,  0.7792, -0.5447,  0.0417,  0.7667,  0.5195,  0.2494],
         [ 0.6045,  0.7397, -0.5581,  0.1025,  0.7120,  0.5026,  0.2222],
         [ 0.5995,  0.7003, -0.5161,  0.1147,  0.6785,  0.5097,  0.2449],
         [ 0.5581,  0.6442, -0.4995,  0.1867,  0.5457,  0.4608,  0.2397],
         [ 0.5747,  0.7933, -0.4060,  0.0116,  0.6090,  0.4748,  0.1945],
         [ 0.5854,  0.8002, -0.5428, -0.0370,  0.7798,  0.4987,  0.3361],
         [ 0.5499,  0.7896, -0.4608,  0.1057,  0.6387,  0.3614,  0.3080],
         [ 0.6699,  0.7687, -0.5598,  0.0736,  0.7365,  0.5093,  0.2796]

#### EXAMPLE 3

In [None]:
text ='Hello my name is Akash soni'
input_ids = xlmr_tokenizer.encode(text, return_tensors="pt")

In [None]:
outputs = xlmr_model(input_ids.to(device)).logits

predictions = torch.argmax(outputs, dim=-1)
print(f"Shape of outputs: {outputs.shape}")

Shape of outputs: torch.Size([1, 10, 7])


In [None]:
data = [i.item() for i in predictions[0]]
data

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
# Model prediction
[index2tag[idx] for idx in data][1:-1]

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

It is very simple to understand that model is not able to make any constructive predictions, so we will have to go for the fine tuning

# Fine Tuning 

### Prediction Matrics

#### Evaluating a NER model is similar to evaluating a text classification model, and it is common to report results for precision, recall, and F1-score. The only subtlety is that all words of an entity need to be predicted correctly in order for a prediction to be counted as correct.

In [None]:
# class 1, class 2 , class 3 , class 4 

# f1 score 

# 0 -- 0  -- 1
# 0 -- 1 -- loss


In [None]:
from seqeval.metrics import classification_report

In [None]:
y_true = [["O", "O", "O", "B-MISC", "I-MISC", "I-MISC", "O"],
          ["B-PER", "I-PER", "O"]]

y_pred = [["O", "O", "B-MISC", "B-MISC", "I-MISC", "I-MISC", "O"],
          ["B-PER", "I-PER", "O"]]


print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

        MISC       0.50      1.00      0.67         1
         PER       1.00      1.00      1.00         1

   micro avg       0.67      1.00      0.80         2
   macro avg       0.75      1.00      0.83         2
weighted avg       0.75      1.00      0.83         2



In [None]:
panx_en_encoded['train']['labels'][10]

[-100, 0, -100, -100, 0, 3, -100, 4, 0, 3, -100]

in our case we have also substituted -100 as IGNORE token so our data may look like

[-100, 0, -100, -100, 0, 3, -100, 4, 0, 3, -100]

but we need to skip all -100 while predicting, as we will have to compare actual labels with predicted labels

After ignoring -100 Actual labels looks like

[0, 0, 3, 4, 0, 3]

&

Predicted label can be 

[0, 1, 3, 4, 2, 5]

so to compare and get the loss out of it we need to remove -100

In [None]:
import numpy as np

def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)

    batch_size, seq_len = preds.shape
    
    labels_list, preds_list = [], []

    for batch_idx in range(batch_size):

        example_labels, example_preds = [], []

        for seq_idx in range(seq_len):
            # Ignore label IDs = -100

            if label_ids[batch_idx, seq_idx] != -100:
                example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
                example_preds.append(index2tag[preds[batch_idx][seq_idx]])

        labels_list.append(example_labels)
        preds_list.append(example_preds)

    return preds_list, labels_list

Once we are done with aligning the predictions we need to 
prepare training arguments for fine-Tuning.

***we only change the arguments we are interested into as there are many arguments which are better left for default.

In [None]:
from transformers import TrainingArguments

num_epochs = 10

batch_size = 24

logging_steps = len(panx_en_encoded["train"].select(range(100))) // batch_size

model_name = f"{xlmr_model_name}-finetuned-panx-en"

training_args = TrainingArguments(
    output_dir=model_name, 
    log_level="error", 
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size, 
    evaluation_strategy="epoch",
    save_steps=1e6, 
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps
    )

In [None]:
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=IntervalStrategy.EPOCH,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=40,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=xlm-roberta-base-finetuned-panx-en/runs/Jul26_06-38-39_7c6a4de5116f,
logging_first_step=False,
logging_nan_

In [None]:
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
    y_pred, y_true = align_predictions(eval_pred.predictions,
                                       eval_pred.label_ids)
    return {"f1": f1_score(y_true, y_pred)}

In [None]:
# The final step is to define a data collator so we can pad each input sequence 
# to the largest sequence length in a batch. nlpt_pin01 Transformers provides a
# dedicated data collator for token classification that will pad the labels along with the inputs:

from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [None]:
def model_init():
    return (XLMRobertaForTokenClassification
            .from_pretrained(xlmr_model_name, config=xlmr_config)
            .to(device))

In [None]:
from transformers import Trainer

trainer = Trainer(model_init=model_init, args=training_args,
                  data_collator=data_collator, compute_metrics=compute_metrics,
                  train_dataset=panx_en_encoded["train"].select(range(1000)),
                  eval_dataset=panx_en_encoded["validation"].select(range(100)),
                  tokenizer=xlmr_tokenizer
                  )

In [None]:
trainer

<transformers.trainer.Trainer at 0x7f8036f31ed0>

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.6089,0.631177,0.549669
2,0.4573,0.471467,0.630824
3,0.3339,0.530872,0.633452
4,0.2509,0.509041,0.661972
5,0.1972,0.522635,0.69258
6,0.1605,0.542197,0.697842
7,0.0822,0.555455,0.737226
8,0.0643,0.558622,0.727941
9,0.0313,0.556372,0.752768
10,0.0476,0.569119,0.745387


TrainOutput(global_step=420, training_loss=0.27094672520955404, metrics={'train_runtime': 121.9652, 'train_samples_per_second': 81.991, 'train_steps_per_second': 3.444, 'total_flos': 212137645273488.0, 'train_loss': 0.27094672520955404, 'epoch': 10.0})

In [None]:
!pwd

/content/drive/MyDrive/FDFS_feb_Batch/NLP/NER task


In [None]:
trainer.save_model("./artifacts/model_weights")

till here we have trained the model

# Model Prediction 

In [22]:
# loading preprocesed data
panx_en_encoded = load_from_disk("./artifacts/serialized_objects")

In [23]:
input_ids = xlmr_tokenizer(new_en["validation"][:10]["tokens"],truncation=True,
                                      is_split_into_words=True)


#### Loading Fine-tuned model
1. We will need the original architecture.
2. We will need Autoconfig as we will have to provide configuration.
3. We will need model the path of directory were we have stored weights of our fine tuned models.

In [24]:
xlmr_fine_model = (XLMRobertaForTokenClassification.from_pretrained("./artifacts/model_weights", config=xlmr_config).to(device))

In [25]:
xlmr_fine_model

XLMRobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Layer

Observer that the last two layers are the ones we have added

# Predictions

#### EXAMPLE 2

In [61]:
datapoint = panx_en_encoded["train"]["input_ids"][15]
actual_tags = panx_en_encoded["train"]["ner_tags_str"][15]
print(datapoint)
tokenized_form = xlmr_tokenizer.convert_ids_to_tokens(datapoint)
actual_form = xlmr_tokenizer.convert_tokens_to_string(tokenized_form)

print(f"Tokenized form \n {tokenized_form}")
print(f"Length of Tokenized form -> {len(tokenized_form)} \n")
print(f"Actual sentence \n {actual_form}")
print(f"Length of actual sentence -> {len(actual_form.split())+1} \n")
print(f"ACTUAL TAGS \n {actual_tags}")

[0, 54041, 24748, 36216, 6, 4, 51978, 111, 166207, 3956, 136, 147202, 46542, 2]
Tokenized form
 ['<s>', '▁Prince', '▁Albert', '▁Victor', '▁', ',', '▁Duke', '▁of', '▁Clare',
'nce', '▁and', '▁Avon', 'dale', '</s>']
Length of Tokenized form -> 14

Actual sentence
 <s> Prince Albert Victor , Duke of Clarence and Avondale</s>
Length of actual sentence -> 11

ACTUAL TAGS
 ['B-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER',
'I-PER']


In [62]:
# convert input_ids into tokens
data = torch.tensor(datapoint)
print(data)
data = data.reshape(1,-1)
print(data)

# applying predictions
# prediction without using fine tuned model
outputs = xlmr_fine_model(data.to(device)).logits
predictions = torch.argmax(outputs, dim=-1)
print(f"\nNumber of tokens in sequence: {len(data[0])}")
print(f"Shape of outputs: {outputs.shape}")

print(outputs)

print("\n\nPREDICTED TAGS")
pred_tags = [index2tag[i.item()] for i in predictions[0][1:-1]]
print(pred_tags)

print(f"ACTUAL TAGS \n {actual_tags}")


tensor([     0,  54041,  24748,  36216,      6,      4,  51978,    111, 166207,
          3956,    136, 147202,  46542,      2])
tensor([[     0,  54041,  24748,  36216,      6,      4,  51978,    111, 166207,
           3956,    136, 147202,  46542,      2]])

Number of tokens in sequence: 14
Shape of outputs: torch.Size([1, 14, 7])
tensor([[[-2.2180,  2.1255,  3.5564, -0.7727, -0.2437, -1.4792, -1.5153],
         [-2.0184,  6.7363,  1.1276,  1.1878, -2.6170, -0.6715, -3.0794],
         [-2.1819,  0.1121,  7.0130, -2.0592,  0.3661, -2.3843, -0.6578],
         [-2.1361, -0.0949,  7.0442, -2.1551,  0.3254, -2.4463, -0.5389],
         [-1.4781,  0.0678,  6.8940, -1.9208,  0.1178, -2.4237, -0.7508],
         [-1.5920,  0.3513,  6.9473, -1.9821, -0.0104, -2.4431, -0.8644],
         [-2.3219,  0.8577,  6.8792, -1.6826,  0.1418, -2.2356, -0.9913],
         [-1.8488, -0.0159,  6.9235, -2.0291,  0.4430, -2.2784, -0.5683],
         [-2.1409, -0.1114,  7.1632, -1.9624,  0.3735, -2.1941, -0.4957]

In [68]:
panx_en_encoded["train"]["labels"][15]

[-100, 1, 2, 2, 2, -100, 2, 2, 2, -100, 2, 2, -100, -100]

In [69]:
predictions[0]

tensor([2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], device='cuda:0')

#### EXAMPLE 3

In [None]:
text ='Hello my name is Akash soni'
input_ids = xlmr_tokenizer.encode(text, return_tensors="pt")

In [None]:
outputs = xlmr_model(input_ids.to(device)).logits

predictions = torch.argmax(outputs, dim=-1)
print(f"Shape of outputs: {outputs.shape}")

Shape of outputs: torch.Size([1, 10, 7])


In [None]:
data = [i.item() for i in predictions[0]]
data

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
# Model prediction
[index2tag[idx] for idx in data][1:-1]

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

It is very simple to understand that model is not able to make any constructive predictions, so we will have to go for the fine tuning

In [None]:
# convert input_ids into tokens
data = torch.tensor(datapoint)
print(data)
data = data.reshape(1,-1)
print(data)

# applying predictions
# prediction without using fine tuned model
outputs = xlmr_fine_model(data.to(device)).logits
predictions = torch.argmax(outputs, dim=-1)
print(f"\nNumber of tokens in sequence: {len(data[0])}")
print(f"Shape of outputs: {outputs.shape}")

print(outputs)

print("\n\nPREDICTED TAGS")
pred_tags = [index2tag[i.item()] for i in predictions[0][1:-1]]
print(pred_tags)

print(f"ACTUAL TAGS \n {actual_tags}")


tensor([     0,  54041,  24748,  36216,      6,      4,  51978,    111, 166207,
          3956,    136, 147202,  46542,      2])
tensor([[     0,  54041,  24748,  36216,      6,      4,  51978,    111, 166207,
           3956,    136, 147202,  46542,      2]])

Number of tokens in sequence: 14
Shape of outputs: torch.Size([1, 14, 7])
tensor([[[-2.2180,  2.1255,  3.5564, -0.7727, -0.2437, -1.4792, -1.5153],
         [-2.0184,  6.7363,  1.1276,  1.1878, -2.6170, -0.6715, -3.0794],
         [-2.1819,  0.1121,  7.0130, -2.0592,  0.3661, -2.3843, -0.6578],
         [-2.1361, -0.0949,  7.0442, -2.1551,  0.3254, -2.4463, -0.5389],
         [-1.4781,  0.0678,  6.8940, -1.9208,  0.1178, -2.4237, -0.7508],
         [-1.5920,  0.3513,  6.9473, -1.9821, -0.0104, -2.4431, -0.8644],
         [-2.3219,  0.8577,  6.8792, -1.6826,  0.1418, -2.2356, -0.9913],
         [-1.8488, -0.0159,  6.9235, -2.0291,  0.4430, -2.2784, -0.5683],
         [-2.1409, -0.1114,  7.1632, -1.9624,  0.3735, -2.1941, -0.4957]

#### EXAMPLE 3

In [26]:
text ='Hello my name is Akash soni'
input_ids = xlmr_tokenizer.encode(text, return_tensors="pt")

In [None]:
outputs = xlmr_fine_model(input_ids.to(device)).logits

predictions = torch.argmax(outputs, dim=-1)
print(f"Shape of outputs: {outputs.shape}")

Shape of outputs: torch.Size([1, 10, 7])


In [None]:
# create integer tokens and attention mask from the sentence
tokenized_input = xlmr_tokenizer(de_example["tokens"], is_split_into_words=True)
tokenized_input

In [None]:
data = [i.item() for i in predictions[0]]
data

[1, 0, 0, 0, 0, 1, 1, 2, 2, 0]

In [None]:
# Model prediction
[index2tag[idx] for idx in data][1:-1]

['O', 'O', 'O', 'O', 'B-PER', 'B-PER', 'I-PER', 'I-PER']

In above predictions threre is a problem, for every token string even if it is not starting with the underscore, we are labelling it. 

We can associate -100 to non underscores tokens and where ever we detct -100 we ignore the predicted label, so our predicion outputs will be aligned.

Let's do it....

In [39]:
text ='Alex went to Imax to watch RRR Movie '
tokenized_input = xlmr_tokenizer(text.split(), is_split_into_words=True)
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)
word_ids = tokenized_input.word_ids()
print(word_ids)

# convert input_ids into tokens
data = torch.tensor(tokenized_input['input_ids'])
data = data.reshape(1,-1)

outputs = xlmr_fine_model(data.to(device)).logits
predictions = torch.argmax(outputs, dim=-1)
print(predictions[0])

['<s>', '▁Alex', '▁went', '▁to', '▁I', 'max', '▁to', '▁watch', '▁R', 'RR',
'▁Movie', '</s>']
[None, 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, None]
tensor([4, 1, 0, 0, 3, 4, 0, 0, 3, 4, 4, 0])


In [40]:
prediction = [i.item() for i in predictions[0]]
previous_word_idx = None
pred_ids = []

for idx, word_idx in enumerate(word_ids):
    if word_idx is None or word_idx == previous_word_idx:
        continue
    elif word_idx != previous_word_idx:
        pred_ids.append(prediction[idx])
    previous_word_idx = word_idx

pred_ids

[1, 0, 0, 3, 0, 0, 3, 4]

In [41]:
[index2tag[idx] for idx in pred_ids]

['B-PER', 'O', 'O', 'B-ORG', 'O', 'O', 'B-ORG', 'I-ORG']

In [41]:
# create integer tokens and attention mask from the sentence
tokenized_input = xlmr_tokenizer(de_example["tokens"], is_split_into_words=True)
tokenized_input

In [None]:
# getting word tokens from the integer tokens
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['<s>',
 '▁*',
 'In',
 'duct',
 'ed',
 '▁into',
 '▁the',
 '▁United',
 '▁States',
 '▁Hockey',
 '▁Hall',
 '▁of',
 '▁Fam',
 'e',
 '▁in',
 '▁2015',
 '</s>']

In [None]:
# for each word token we will provide it with the word id.
word_ids = tokenized_input.word_ids()
word_ids

[None, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, None]