# Multilingual Named Entity Recognition

In [None]:
import pandas as pd
!pip install --upgrade datasets fsspec

In [None]:
from datasets import load_dataset
# Load the German ("de") PAN-X configuration of the "xtreme" dataset. The result
# is not assigned: as the last expression of a notebook cell it is only displayed.
load_dataset("xtreme",name="PAN-X.de")

In [None]:
from collections import defaultdict
from datasets import DatasetDict

# Language codes and, position by position, the fraction of every split that
# is kept for that language.
langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059]

# Maps language code -> DatasetDict; a missing language gets an empty
# DatasetDict on first access.
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    # Load the PAN-X configuration for this language.
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    for split in ds:
        # Number of rows to keep, truncated towards zero.
        num_keep = int(frac * ds[split].num_rows)
        # Shuffle with a fixed seed so the retained subset is reproducible,
        # then keep the first `num_keep` rows of the shuffled split.
        shuffled = ds[split].shuffle(seed=0)
        panx_ch[lang][split] = shuffled.select(range(num_keep))

In [None]:
# Inner feature of the `ner_tags` column of the German training split. Other
# cells use it for id <-> name conversion (`int2str`, `names`, `num_classes`).
tags=panx_ch['de']['train'].features['ner_tags'].feature
# Bare expression: displayed when it is the last line of a notebook cell.
tags

In [None]:
def create_tag_names(batch):
    """Convert the integer NER tag ids in ``batch`` to their string names.

    Args:
        batch: Mapping with a ``"ner_tags"`` entry holding an iterable of
            integer tag ids. Despite the name, this file passes the function
            to ``map`` without ``batched=True``, so it presumably receives
            one example at a time.

    Returns:
        A dict with a single ``"ner_tags_str"`` key whose value is the list
        of tag names, in the same order as the input ids.
    """
    tag_names = list(map(tags.int2str, batch["ner_tags"]))
    return {'ner_tags_str': tag_names}

# Apply `create_tag_names` to the German subset, adding the `ner_tags_str`
# entry it returns to the data.
panx_de=panx_ch['de'].map(create_tag_names)

# Multilingual Transformers

In [None]:
from transformers import AutoTokenizer
# Checkpoint identifiers for the two tokenizers being compared.
bert_model_name = 'bert-base-cased'
xlmr_model_name = 'xlm-roberta-base'

# Load the tokenizer that belongs to each checkpoint.
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

In [None]:
# Sample sentence reused by the later cells.
text = 'Jack Sparrow loves Tokyo!'

# Tokenize the same sentence with both tokenizers and keep the token strings.
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()

In [None]:
# One row per tokenizer so the two tokenizations can be compared side by side.
df = pd.DataFrame(
    [bert_tokens, xlmr_tokens],
    index=['Bert', 'XLM-R'],
)
# Bare expression: renders the table when it is the last line of the cell.
df

# SentencePiece Tokenizer

In [None]:
# Concatenate the XLM-R tokens and turn every U+2581 character (the marker
# SentencePiece-style tokenizers use for whitespace) back into a plain space.
# The result is only displayed, not stored.
''.join(xlmr_tokens).replace('\u2581'," ")

# Creating a Custom Model for Token Classification

In [None]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """RoBERTa-style encoder with a per-token linear classification head.

    The body is a ``RobertaModel`` built without its pooling layer. Each
    token's final hidden state goes through dropout and a linear layer that
    produces ``config.num_labels`` logits.
    """

    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Build the encoder body and the classification head from ``config``.

        Args:
            config: Model configuration; ``num_labels``,
                ``hidden_dropout_prob`` and ``hidden_size`` are read here.
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        # Model body; the pooling layer is not needed for per-token outputs.
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # Token classification head.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Inherited initialisation hook of the pretrained-model base class.
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
                labels=None, **kwargs):
        """Run the encoder and classify every token.

        Args:
            input_ids: Token ids passed to the encoder
                (assumed shape ``(batch, seq_len)`` -- TODO confirm).
            attention_mask: Optional mask forwarded to the encoder.
            token_type_ids: Optional segment ids forwarded to the encoder.
            labels: Optional target tag ids, one per token. When given, a
                cross-entropy loss over all tokens is computed.
            **kwargs: Extra keyword arguments forwarded to the encoder.

        Returns:
            A ``TokenClassifierOutput`` with ``loss`` (``None`` when no
            labels are given), ``logits``, and the encoder's
            ``hidden_states`` and ``attentions``.
        """
        # Encoder pass; the first output holds the per-token hidden states.
        outputs = self.roberta(input_ids, attention_mask=attention_mask,
                               token_type_ids=token_type_ids, **kwargs)
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            # Flatten so every token is one classification sample. Logits
            # and labels are two separate arguments to the loss function.
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return TokenClassifierOutput(loss=loss, logits=logits,
                                     hidden_states=outputs.hidden_states,
                                     attentions=outputs.attentions)

# Loading a Custom Model

In [None]:
# id -> tag name, using each name's position in `tags.names` as its id.
index2tag = dict(enumerate(tags.names))
# tag name -> id, the inverse of the mapping above.
tag2index = {tag: idx for idx, tag in index2tag.items()}

In [None]:
# Display the tag names; a name's position in this list is the id used by
# `index2tag` / `tag2index`.
tags.names

In [None]:
from transformers import AutoConfig
# Start from the checkpoint's configuration and override the label-related
# fields so the classification head matches this tag set.
xlmr_config = AutoConfig.from_pretrained(
    xlmr_model_name,
    num_labels=tags.num_classes,
    id2label=index2tag,
    label2id=tag2index,
)

In [None]:
import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the checkpoint into the custom token-classification class using the
# adjusted configuration, then move the model to the selected device.
xlmr_model = XLMRobertaForTokenClassification.from_pretrained(
    xlmr_model_name,
    config=xlmr_config,
).to(device)

In [None]:
# Encode the sample sentence as a PyTorch tensor of input ids.
input_ids = xlmr_tokenizer.encode(text, return_tensors='pt')

# Show each token next to its id; the first row of `input_ids` is converted
# to a NumPy array for the table.
pd.DataFrame(
    [xlmr_tokens, input_ids[0].numpy()],
    index=['Tokens', 'Input IDs'],
)

In [None]:
# Forward pass on the model's device; keep only the logits.
outputs = xlmr_model(input_ids.to(device)).logits
# Highest-scoring class index along the last dimension of the logits.
predictions = outputs.argmax(dim=-1)

print(f'Number of tokens in sequence: {len(predictions[0])}')
print(f'Shape of outputs tensor: {outputs.shape}')

In [None]:
# Bring the predicted class ids of the first sequence back to the CPU as a
# NumPy array, then look up the tag name for each id.
predicted_ids = predictions[0].cpu().numpy()
preds = [tags.names[p] for p in predicted_ids]

# Show each token next to its predicted tag.
pd.DataFrame([xlmr_tokens, preds], index=['Tokens', 'Tags'])