# 多言語の固有表現認識

* ゼロショット異言語間転移
* XLM-RoBERTa

## データセット

* XTREME: Cross-lingual TRansfer Evaluation of Multilingual Encoders ベンチマーク
* WikiANN または PAN-X
* 多言語のWikipedia記事
* LOC（場所）、PER（人名）、ORG（組織名）でアノテーションされている
* `B-` 固有表現の先頭
* `I-` 固有表現に属する連続したトークン
* `O` どの固有表現にも属さない

In [None]:
# データセットのconfigを取得
from datasets import get_dataset_config_names

xtreme_subsets = get_dataset_config_names("xtreme")
len(xtreme_subsets)

In [None]:
# PAN-Xに絞り込み
panx_subsets = [s for s in xtreme_subsets if s.startswith("PAN")]
panx_subsets[:3]

In [None]:
# ドイツ語のコーパスをロード
from datasets import load_dataset

load_dataset("xtreme", name="PAN-X.de")

In [None]:
# スイスコーパスを模倣するためにドイツ語、フランス語、イタリア語、英語のコーパスを話者の比率でサンプリング
from collections import defaultdict
from datasets import DatasetDict

# Languages to mix and the fraction of each split to keep per language.
langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059]
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    # Load the monolingual PAN-X corpus for this language.
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")

    # Shuffle every split with a fixed seed and keep the first `frac` share.
    for split, split_ds in ds.items():
        n_keep = int(frac * split_ds.num_rows)
        panx_ch[lang][split] = split_ds.shuffle(seed=0).select(range(n_keep))

In [None]:
import pandas as pd

pd.DataFrame({lang: [panx_ch[lang]["train"].num_rows] for lang in langs},
             index=["Number of training examples"])

In [None]:
# ドイツ語コーパスのデータを確認
element = panx_ch["de"]["train"][0]
for key, value in element.items():
    print(f"{key}: {value}")

In [None]:
for key, value in panx_ch["de"]["train"].features.items():
    print(f"{key}: {value}")

In [None]:
tags = panx_ch["de"]["train"].features["ner_tags"].feature
print(tags)

In [None]:
tags.int2str(1)

In [None]:
# Add a new ner_tags_str column holding the string form of each ner_tags ID.
def create_tag_names(batch):
    """Return a ``ner_tags_str`` entry built by applying ``tags.int2str`` to each ID."""
    tag_names = list(map(tags.int2str, batch["ner_tags"]))
    return {"ner_tags_str": tag_names}

panx_de = panx_ch["de"].map(create_tag_names)

In [None]:
# DatasetDictにner_tags_strが追加されている
panx_de

In [None]:
de_example = panx_de["train"][0]
print(de_example)

In [None]:
pd.DataFrame([de_example["tokens"], de_example["ner_tags_str"]],
             ["Tokens", "Tags"])

In [None]:
# タグの数に不均衡がないか確認
from collections import Counter

# Count entities per split: each "B-" tag marks the start of one entity.
split2freqs = defaultdict(Counter)
for split_name, split_ds in panx_de.items():
    for tag_seq in split_ds["ner_tags_str"]:
        for tag_name in tag_seq:
            if not tag_name.startswith("B"):
                continue
            # "B-PER" -> "PER"
            entity_type = tag_name.split("-")[1]
            split2freqs[split_name][entity_type] += 1
pd.DataFrame.from_dict(split2freqs, orient="index")

## 多言語Transformer

* 事前学習に用いるコーパスが多言語の文書から構成されている
* 言語を区別する明示的な情報がなくても下流タスクに対して汎化できる

## XLM-RoBERTa（XLM-R）

* 100言語に対してマスク言語モデルのみで事前学習
* WikipediaのダンプデータとCommon Crawlデータを使って訓練
* トークン化にSentencePieceを使用

In [None]:
# BERTのWordPiece TokenizerとXLM-RのSentencePiece Tokenizerを比較
from transformers import AutoTokenizer

bert_model_name = "bert-base-cased"
xlmr_model_name = "xlm-roberta-base"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

In [None]:
text = "Jack Sparrow loves New York!"
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()

In [None]:
bert_tokens

In [None]:
xlmr_tokens

## Transformerモデルクラスの詳細

In [None]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

In [None]:
class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """Token classifier: a RoBERTa body followed by dropout and a linear head."""

    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Build the body and the classification head from ``config``."""
        super().__init__(config)
        self.num_labels = config.num_labels

        # The body is shared with RoBERTa; the pooling layer is left out.
        self.roberta = RobertaModel(config, add_pooling_layer=False)

        # Head: dropout followed by one linear layer applied per token.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Weight setup method inherited from RobertaPreTrainedModel.
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                labels=None, **kwargs):
        """Run body and head; also compute cross-entropy loss when ``labels`` is given.

        Returns a ``TokenClassifierOutput`` carrying ``loss`` (``None`` without
        labels), ``logits``, and the body's ``hidden_states`` and ``attentions``.
        """
        body_output = self.roberta(input_ids, attention_mask=attention_mask,
                                   token_type_ids=token_type_ids, **kwargs)

        # The first element of the body output feeds the head.
        token_states = self.dropout(body_output[0])
        logits = self.classifier(token_states)

        if labels is None:
            loss = None
        else:
            # Flatten all leading dimensions so each token is one sample.
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            loss = nn.CrossEntropyLoss()(flat_logits, flat_labels)

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=body_output.hidden_states,
            attentions=body_output.attentions,
        )

In [None]:
tags.names

# Two-way lookup between class indices and tag names.
index2tag = dict(enumerate(tags.names))
tag2index = {tag: idx for idx, tag in index2tag.items()}

In [None]:
print(index2tag)
print(tag2index)

In [None]:
tags.num_classes

In [None]:
from transformers import AutoConfig

xlmr_config = AutoConfig.from_pretrained(xlmr_model_name,
                                         num_labels=tags.num_classes,
                                         id2label=index2tag,
                                         label2id=tag2index)

In [None]:
xlmr_config

In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
xlmr_model = XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name, config=xlmr_config).to(device)

In [None]:
text

In [None]:
input_ids = xlmr_tokenizer.encode(text, return_tensors="pt")
input_ids

In [None]:
pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], index=["Tokens", "Input IDs"])

In [None]:
# それぞれのトークンに対する分類ラベル
outputs = xlmr_model(input_ids.to(device)).logits
outputs.shape

In [None]:
predictions = torch.argmax(outputs, dim=-1)
predictions.shape

In [None]:
# この時点ではヘッドが訓練されていないためランダムな出力
preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
pd.DataFrame([xlmr_tokens, preds], index=["tokens", "Tags"])

In [None]:
def tag_text(text, tags, model, tokenizer):
    """Tag ``text`` with ``model`` and return tokens and predicted tags as a table.

    Args:
        text: Raw input string.
        tags: Object whose ``names`` sequence maps a class index to a tag name.
        model: Callable model; element ``[0]`` of its output is taken as the logits.
        tokenizer: Tokenizer used for both the token strings and the input IDs.

    Returns:
        A ``pd.DataFrame`` with a "Tokens" row and a "Tags" row, one column
        per token (special tokens included).
    """
    # Encode once with the tokenizer that was passed in, so the displayed
    # tokens and the IDs fed to the model always come from the same tokenizer.
    encoding = tokenizer(text, return_tensors="pt")
    tokens = encoding.tokens()
    input_ids = encoding.input_ids

    # Put the inputs on the model's own device rather than relying on a
    # notebook-level global; a parameter-less model leaves them where they are.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(input_ids)[0]

    # Most likely class per token, taken over the last (class) axis.
    predictions = torch.argmax(logits, dim=2)
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

## 固有表現抽出のためのテキストトークン化

In [None]:
print(de_example)

In [None]:
words = de_example["tokens"]
words

In [None]:
labels = de_example["ner_tags"]
labels

In [None]:
# is_split_into_words=Trueは入力はすでに単語に分割されていることを示す
tokenized_input = xlmr_tokenizer(de_example["tokens"], is_split_into_words=True)
tokenized_input

In [None]:
# どういうトークンに分割されたか調べる
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
pd.DataFrame([tokens], index=["Tokens"])

In [None]:
word_ids = tokenized_input.word_ids()
pd.DataFrame([tokens, word_ids], index=["Tokens", "Word IDs"])

In [None]:
# Tokens without a leading "▁" continue the previous word, so they need no
# label of their own and are masked. Special tokens (word ID None) are masked too.
# The mask value is -100, which is the ignore_index value of nn.CrossEntropyLoss.
previous_word_idx = None
label_ids = []

for word_idx in word_ids:
    starts_new_word = word_idx is not None and word_idx != previous_word_idx
    label_ids.append(labels[word_idx] if starts_new_word else -100)
    previous_word_idx = word_idx

In [None]:
print(label_ids)

In [None]:
# ▁がついたトークンのみ正解ラベルが割り振られていることがわかる
labels = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["Tokens", "Word IDs", "Label IDs", "Labels"]
pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach per-token labels.

    Only the first sub-word token of each word receives that word's entry from
    ``examples["ner_tags"]``; every other position is labelled -100.
    """
    # The tokenizer output is returned with a "labels" entry added below.
    tokenized_inputs = xlmr_tokenizer(examples["tokens"], truncation=True,
                                      is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        row_labels = []
        prev_word_id = None
        for word_id in tokenized_inputs.word_ids(batch_index=batch_idx):
            is_first_subword = word_id is not None and word_id != prev_word_id
            row_labels.append(word_labels[word_id] if is_first_subword else -100)
            prev_word_id = word_id
        aligned_labels.append(row_labels)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

In [None]:
def encode_panx_dataset(corpus):
    """Map ``tokenize_and_align_labels`` over ``corpus`` in batches, dropping the raw columns."""
    raw_columns = ["langs", "ner_tags", "tokens"]
    return corpus.map(tokenize_and_align_labels, batched=True,
                      remove_columns=raw_columns)

In [None]:
panx_ch["de"]

In [None]:
panx_de_encoded = encode_panx_dataset(panx_ch["de"])
panx_de_encoded

In [None]:
print(panx_de_encoded["train"][0]["input_ids"])
print(panx_de_encoded["train"][0]["attention_mask"])
print(panx_de_encoded["train"][0]["labels"])

## 評価指標

* 適合率（precision）、再現率（recall）、F1スコアで評価する
* 固有表現を構成するすべてのトークンで正しく予測できている必要がある
* [seqeval](https://github.com/chakki-works/seqeval) というリポジトリが有用

In [None]:
!pip install seqeval

In [None]:
from seqeval.metrics import classification_report

y_true = [["O", "O", "O", "B-MISC", "I-MISC", "I-MISC", "O"],
          ["B-PER", "I-PER", "O"]]

y_pred = [["O", "O", "B-MISC", "I-MISC", "I-MISC", "I-MISC", "O"],
          ["B-PER", "I-PER", "O"]]

print(classification_report(y_true, y_pred))

In [None]:
import numpy as np

def align_predictions(predictions, label_ids):
    """Turn logits and label IDs into per-example lists of tag names.

    Positions whose label ID is -100 are dropped from both lists.

    Returns:
        ``(preds_list, labels_list)`` with one inner list per example.
    """
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []

    for row in range(batch_size):
        # Keep only positions that carry a real label.
        kept = [col for col in range(seq_len) if label_ids[row, col] != -100]
        labels_list.append([index2tag[label_ids[row][col]] for col in kept])
        preds_list.append([index2tag[preds[row][col]] for col in kept])

    return preds_list, labels_list

## XLM-RoBERTaのFine-tuning

* 現状はヘッド部分がランダムな重みの状態なのでFine-tuningが必要
* ドイツ語でFine-tuningする
* フランス語、イタリア語、英語でのゼロショット言語間転移性能を評価する

In [None]:
from transformers import TrainingArguments

# Hyperparameters for fine-tuning on the encoded German split.
num_epochs = 3
batch_size = 24
# Whole number of batches that fit in the training split.
logging_steps = len(panx_de_encoded["train"]) // batch_size
model_name = f"{xlmr_model_name}-finetuned-panx-de"

training_args = TrainingArguments(
    output_dir=model_name,
    push_to_hub=True,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    # NOTE(review): presumably large enough that no step checkpoint is written; confirm.
    save_steps=1e6,
    logging_steps=logging_steps,
    log_level="error",
    disable_tqdm=False,
)

In [None]:
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
    """Return ``{"f1": ...}`` computed on the aligned tag sequences of ``eval_pred``."""
    preds_list, labels_list = align_predictions(eval_pred.predictions,
                                                eval_pred.label_ids)
    score = f1_score(labels_list, preds_list)
    return {"f1": score}

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [None]:
# Return a new model
def model_init():
    """Load the model from ``xlmr_model_name`` with ``xlmr_config`` and move it to ``device``."""
    model = XLMRobertaForTokenClassification.from_pretrained(
        xlmr_model_name, config=xlmr_config)
    return model.to(device)

In [None]:
# 新しいバージョンではエラーが出るので環境変数を設定
%env TOKENIZERS_PARALLELISM=falsea

In [None]:
from transformers import Trainer

trainer = Trainer(model_init=model_init,
                  args=training_args, 
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  train_dataset=panx_de_encoded["train"],
                  eval_dataset=panx_de_encoded["validation"], 
                  tokenizer=xlmr_tokenizer)

In [None]:
trainer.train()

In [None]:
trainer.push_to_hub(commit_message="Training completed!")

In [None]:
text_de = "Jeff Dean ist ein Informatiker bei Google in Kalifornien"
tag_text(text_de, tags, trainer.model, xlmr_tokenizer)

## エラー分析

In [None]:
from torch.nn.functional import cross_entropy

def forward_pass_with_label(batch):
    """Run ``trainer.model`` on a batch and return per-token loss and predictions.

    Args:
        batch: Dict of equal-length lists (one entry per example), as passed
            by a batched ``map`` call.

    Returns:
        Dict with ``"loss"`` and ``"predicted_label"`` arrays, one row per
        example. Rows are as long as the collated (padded) batch.
    """
    # Convert dict of lists to list of dicts suitable for the data collator.
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    # Collate inputs and labels, then move all tensors to the device.
    batch = data_collator(features)
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

    with torch.no_grad():
        output = trainer.model(input_ids, attention_mask=attention_mask)
        # Logits are indexed as [example, position, class]; pick the largest
        # logit along the class axis.
        predicted_label = torch.argmax(output.logits, dim=-1).cpu().numpy()

    # Read the class count from the logits instead of hard-coding 7, so the
    # function keeps working if the tag set changes.
    num_classes = output.logits.shape[-1]
    # Per-token loss after flattening the batch dimension.
    loss = cross_entropy(output.logits.view(-1, num_classes),
                         labels.view(-1), reduction="none")
    # Restore the batch dimension and convert to a numpy array.
    loss = loss.view(len(input_ids), -1).cpu().numpy()

    return {"loss": loss, "predicted_label": predicted_label}

In [None]:
valid_set = panx_de_encoded["validation"]
valid_set

In [None]:
batch = valid_set[0:3]
print(batch)

In [None]:
features = [dict(zip(batch, t)) for t in zip(*batch.values())]
print(features)

In [None]:
batch = data_collator(features)
input_ids = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)
labels = batch["labels"].to(device)

In [None]:
output = trainer.model(input_ids, attention_mask)
# (batch, seqlen, class)
output["logits"].shape

In [None]:
predicted_label = torch.argmax(output.logits, axis=-1)
predicted_label

In [None]:
labels

In [None]:
# バッチと系列長をまとめる
output.logits.view(-1, 7).shape, labels.view(-1).shape

In [None]:
loss = cross_entropy(output.logits.view(-1, 7), labels.view(-1), reduction="none")
loss.shape

In [None]:
valid_set = panx_de_encoded["validation"]
valid_set = valid_set.map(forward_pass_with_label, batched=True, batch_size=32)
df = valid_set.to_pandas()

In [None]:
df

In [None]:
index2tag

In [None]:
index2tag[-100] = "IGN"
df["input_tokens"] = df["input_ids"].apply(lambda x: xlmr_tokenizer.convert_ids_to_tokens(x))
df["predicted_label"] = df["predicted_label"].apply(lambda x: [index2tag[i] for i in x])
df["labels"] = df["labels"].apply(lambda x: [index2tag[i] for i in x])
df['loss'] = df.apply(lambda x: x['loss'][:len(x['input_ids'])], axis=1)
df['predicted_label'] = df.apply(lambda x: x['predicted_label'][:len(x['input_ids'])], axis=1)
df.head(1)

In [None]:
df_tokens = df.apply(pd.Series.explode)
df_tokens = df_tokens.query("labels != 'IGN'")
df_tokens["loss"] = df_tokens["loss"].astype(float).round(2)
df_tokens.head(7)

In [None]:
(
    df_tokens.groupby("input_tokens")[["loss"]]
    .agg(["count", "mean", "sum"])
    .droplevel(level=0, axis=1)  # Get rid of multi-level columns
    .sort_values(by="sum", ascending=False)
    .reset_index()
    .round(2)
    .head(10)
    .T
)

In [None]:
(
    df_tokens.groupby("labels")[["loss"]] 
    .agg(["count", "mean", "sum"])
    .droplevel(level=0, axis=1)
    .sort_values(by="mean", ascending=False)
    .reset_index()
    .round(2)
    .T
)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def plot_confusion_matrix(y_preds, y_true, labels):
    """Plot a row-normalized confusion matrix.

    Args:
        y_preds: Predicted class values.
        y_true: Ground-truth class values.
        labels: Class values in display order. They set both the row/column
            order of the matrix and the tick labels, so they must be the same
            values that occur in ``y_true`` / ``y_preds``.
    """
    # Pass `labels` explicitly. Without it the matrix is ordered by sorted
    # label value, which for string tags differs from `labels` order and
    # would put the wrong tick label on each row and column.
    cm = confusion_matrix(y_true, y_preds, labels=labels, normalize="true")
    fig, ax = plt.subplots(figsize=(6, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()

In [None]:
import matplotlib.pyplot as plt

# Keyword arguments keep predictions and ground truth in their proper slots
# (the signature takes y_preds first, then y_true).
plot_confusion_matrix(y_preds=df_tokens["predicted_label"],
                      y_true=df_tokens["labels"],
                      labels=tags.names)

## 言語間転移

In [None]:
def get_f1_score(trainer, dataset):
    """Return the ``"test_f1"`` metric from ``trainer.predict(dataset)``."""
    prediction_output = trainer.predict(dataset)
    metrics = prediction_output.metrics
    return metrics["test_f1"]

In [None]:
f1_scores = defaultdict(dict)
f1_scores["de"]["de"] = get_f1_score(trainer, panx_de_encoded["test"])
f1_scores["de"]["de"]

In [None]:
# French example sentence; "Californie" is the French spelling
# (the German "Kalifornien" had been left in).
text_fr = "Jeff Dean est informaticien chez Google en Californie"
tag_text(text_fr, tags, trainer.model, xlmr_tokenizer)

In [None]:
def evaluate_lang_performance(lang, trainer):
    """Encode ``panx_ch[lang]`` and return the trainer's F1 score on its test split."""
    encoded = encode_panx_dataset(panx_ch[lang])
    test_split = encoded["test"]
    return get_f1_score(trainer, test_split)

In [None]:
# de => frに転移した結果を評価
f1_scores["de"]["fr"] = evaluate_lang_performance("fr", trainer)
f1_scores["de"]["fr"]

In [None]:
f1_scores["de"]["it"] = evaluate_lang_performance("it", trainer)
f1_scores["de"]["it"]

In [None]:
f1_scores["de"]["en"] = evaluate_lang_performance("en", trainer)
f1_scores["de"]["en"]

In [None]:
# フランス語でスクラッチから訓練したときの精度はどう変わるか？
def train_on_subset(dataset, num_samples):
    """Train on a shuffled subset of the training split and score the test split.

    Args:
        dataset: Mapping with "train", "validation" and "test" splits.
        num_samples: Number of training examples to keep after shuffling.

    Returns:
        A one-row ``pd.DataFrame`` with columns ``num_samples`` and ``f1_score``.

    Side effects:
        Updates ``training_args.logging_steps`` and, when
        ``training_args.push_to_hub`` is set, pushes the trained model.
    """
    train_ds = dataset["train"].shuffle(seed=42).select(range(num_samples))
    valid_ds = dataset["validation"]
    test_ds = dataset["test"]

    # The attribute is `logging_steps`; assigning to `logging_step` only
    # created an unused attribute and left the logging interval unchanged.
    # Clamp to 1 so a subset smaller than one batch never yields 0.
    training_args.logging_steps = max(1, len(train_ds) // batch_size)

    trainer = Trainer(model_init=model_init,
                      args=training_args,
                      data_collator=data_collator,
                      compute_metrics=compute_metrics,
                      train_dataset=train_ds,
                      eval_dataset=valid_ds,
                      tokenizer=xlmr_tokenizer)
    trainer.train()
    if training_args.push_to_hub:
        trainer.push_to_hub(commit_message="Training completed!")

    # Named test_f1 so the local does not shadow the imported f1_score function.
    test_f1 = get_f1_score(trainer, test_ds)
    return pd.DataFrame.from_dict({
        "num_samples": [len(train_ds)],
        "f1_score": [test_f1]})

In [None]:
panx_ch["fr"]

In [None]:
panx_fr_encoded = encode_panx_dataset(panx_ch["fr"])
panx_fr_encoded

In [None]:
training_args.push_to_hub = False
metrics_df = train_on_subset(panx_fr_encoded, 250)
metrics_df