In [1]:
!pip install ipykernel

Collecting jedi>=0.16 (from ipython>=7.23.1->ipykernel)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


**NER** -> Named Entity Recognition - information extraction method that locates and classifies entitities present in the unstructured text (document).<br>
eg: person, date, location etc.


**Benchmark Datasets**:
1. cOnll 2003
2. XTREME
3. crossNER

In [3]:
import pandas as pd

tokens = "Jeff Dean is a computer scientist at Google Adravind Facebook, Snapchat in California".split()
labels = ["B-PER", "I-PER", "O", "O", "O", "O", "O", "B-PER", "B-ORG", "I-ORG", "O", "B-LOC"]

df = pd.DataFrame(data=[tokens, labels], index=['tokens', 'labels'])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
tokens,Jeff,Dean,is,a,computer,scientist,at,Google,Adravind,"Facebook,",Snapchat,in,California
labels,B-PER,I-PER,O,O,O,O,O,B-PER,B-ORG,I-ORG,O,B-LOC,


Dataset Utilized: XTREME (Cross-Lingual TRansfer Evalation of Multilingual Encoders)

In [4]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.0-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [7]:
from datasets import get_dataset_config_names
xtreme_subsets = get_dataset_config_names("xtreme")

In [8]:
print(f"XTREME has {len(xtreme_subsets)} configurations")

XTREME has 183 configurations


In [9]:
panx_subsets = [s for s in xtreme_subsets if s.startswith("PAN")]
panx_subsets[:3]

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']

In [10]:
len(panx_subsets)

40

In [11]:
from datasets import load_dataset

load_dataset("xtreme", name="PAN-X.en")

train-00000-of-00001.parquet:   0%|          | 0.00/942k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/472k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/472k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [12]:
en = load_dataset("xtreme", name="PAN-X.en")

In [13]:
en.keys()

dict_keys(['train', 'validation', 'test'])

In [14]:
en_train = en["train"]

In [15]:
en_train

Dataset({
    features: ['tokens', 'ner_tags', 'langs'],
    num_rows: 20000
})

In [16]:
en_train[42]

{'tokens': ['File', ':', 'CCStaClara.JPG|Through', 'Santa', 'Clara'],
 'ner_tags': [0, 0, 0, 5, 6],
 'langs': ['en', 'en', 'en', 'en', 'en']}

In [19]:
pd.DataFrame(en["train"][0]).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
tokens,R.H.,Saunders,(,St.,Lawrence,River,),(,968,MW,)
ner_tags,3,4,0,3,4,4,0,0,0,0,0
langs,en,en,en,en,en,en,en,en,en,en,en


In [21]:
for key, value in en["train"].features.items():
    print(f"{key}: {value}\n")

tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)

ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)

langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)



In [26]:
from collections import Counter
from collections import defaultdict
from datasets import DatasetDict

split2freqs = defaultdict(Counter)
for split, dataset in en.items():
    for row in dataset["ner_tags"]:
        for tag_id in row:
            tag = dataset.features["ner_tags"].feature.int2str(tag_id)
            if tag.startswith("B"):
                tag_type = tag.split("-")[1]
                split2freqs[split][tag_type] += 1
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,ORG,PER,LOC
train,9422,9164,9345
validation,4677,4635,4834
test,4745,4556,4657


In [27]:
# XLM-Roberta Model

from transformers import AutoTokenizer

bert_model_name = "bert-base-cased"
xlmr_model_name = "xlm-roberta-base"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [32]:
 text = "Jack Sparrow Loves New York!"
 bert_tokens = bert_tokenizer(text).tokens()
 xlmr_tokens = xlmr_tokenizer(text).tokens()

In [33]:
df = pd.DataFrame(data=[bert_tokens, xlmr_tokens], index = ["BERT", "XLM-R"])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
BERT,[CLS],Jack,Spa,##rrow,Loves,New,York,!,[SEP],
XLM-R,<s>,▁Jack,▁Spar,row,▁Love,s,▁New,▁York,!,</s>


In [35]:
# Custom Model for Token Classification

import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    config_class = XLMRobertaConfig

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        outputs = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, **kwargs)
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)



In [36]:
index2tag = {idx: tag for idx, tag in enumerate(en["train"].features["ner_tags"].feature.names)}
tag2index = {tag: idx for idx, tag in enumerate(en["train"].features["ner_tags"].feature.names)}

In [38]:
from transformers import AutoConfig

xlmr_model_name = "xlm-roberta-base"
xlmr_config = AutoConfig.from_pretrained(xlmr_model_name, num_labels=len(index2tag))

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "out

In [39]:
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

xlmr_model = XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name, config=xlmr_config)
xlmr_model.to(device)

cpu


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/model.safetensors
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobe

XLMRobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
        

In [40]:
input_ids = xlmr_tokenizer(text, return_tensors="pt").input_ids.to(device)
pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], index = ["Tokens", "Input IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁Love,s,▁New,▁York,!,</s>
Input IDs,0,21763,37456,15555,11954,7,2356,5753,38,2


In [44]:
outputs = xlmr_model(input_ids)
predictions = torch.argmax(outputs.logits, dim=-1)
print(f"Number of tokens in sequence: {len(xlmr_tokens)}")
print(f"Shape of Output: {outputs.logits.shape}")

Number of tokens in sequence: 10
Shape of Output: torch.Size([1, 10, 7])


In [45]:
# creating a function to perform all operations

def tag_text(text, tages, model, tokenizer):
    tokens = tokenizer(text).config
    input_ids = xlmr.tokenizer(text, return_tensors="pt").input_ids.to(device)
    outputs = model(input_ids)[0]
    predictions = torch.argmax(outputs, dim = 2)
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])