In [None]:
!pip install datasets

In [57]:
from collections import Counter, defaultdict
from datasets import get_dataset_config_names, load_dataset
from datasets import DatasetDict

import pandas as pd

XTREME is a benchmark dataset from Google Research for evaluating multilingual NLP models:

https://github.com/google-research/xtreme/blob/master/README.md

For the NER task we need the PAN-X subset. It contains articles annotated with labels such as LOC, PER, and ORG in IOB2 format.

In [3]:
# list the names of all configurations (subsets) available for XTREME
subsets = get_dataset_config_names("xtreme")

Downloading builder script:   0%|          | 0.00/9.09k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/23.1k [00:00<?, ?B/s]

In [6]:
# check the number of configurations in the dataset
len(subsets)

183

In [11]:
# peek at the last five configuration names
subsets[-5:]

['udpos.Thai',
 'udpos.Turkish',
 'udpos.Urdu',
 'udpos.Vietnamese',
 'udpos.Yoruba']

In [20]:
# keep only the PAN-X configurations (prefix matched case-insensitively)
pan_sets = [config for config in subsets if config.lower().startswith("pan")]
# how many there are, plus every fifth name as a sample
(len(pan_sets), pan_sets[::5])

(40,
 ['PAN-X.af',
  'PAN-X.el',
  'PAN-X.fa',
  'PAN-X.hu',
  'PAN-X.ka',
  'PAN-X.ms',
  'PAN-X.sw',
  'PAN-X.tr'])

In [21]:
# load a single configuration (PAN-X.tr) to inspect its splits and features
dataset = load_dataset("xtreme", name='PAN-X.tr')
dataset

Downloading and preparing dataset xtreme/PAN-X.tr (download: 223.17 MiB, generated: 7.25 MiB, post-processed: Unknown size, total: 230.42 MiB) to /root/.cache/huggingface/datasets/xtreme/PAN-X.tr/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e...


Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset xtreme downloaded and prepared to /root/.cache/huggingface/datasets/xtreme/PAN-X.tr/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [22]:
# languages to load, and the fraction of every split to keep for each one
# (the two lists are paired by position in the loading loop below)
langs = ["de", "en", "fr", "it"]
fractions = [0.629, 0.059, 0.229, 0.084]

In [29]:
# per-language container: a missing language key gets an empty DatasetDict
panx = defaultdict(DatasetDict)
panx

defaultdict(datasets.dataset_dict.DatasetDict, {})

In [31]:
# subsample every split of each language according to its fraction
for lang, frac in zip(langs, fractions):
    data = load_dataset("xtreme", name=f"PAN-X.{lang}")
    for split in data:
        # seeded shuffle, so the selected rows are the same on every run
        shuffled = data[split].shuffle(seed=42)
        # number of rows to keep; int() truncates towards zero
        n_keep = int(frac * data[split].num_rows)
        panx[lang][split] = shuffled.select(range(n_keep))

# training-set size per language, shown as a one-row table
train_counts = {lang: [panx[lang]["train"].num_rows] for lang in langs}
pd.DataFrame(train_counts, index=["Count"])

Reusing dataset xtreme (/root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e/cache-f522d4a37ddd3e20.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e/cache-4f6c26e9ec092e78.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e/cache-f580af432548c8ec.arrow
Reusing dataset xtreme (/root/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e/cache-222b8e7762533528.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e/cache-a54caf20dbd52b70.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e/cache-e4394e42ca877189.arrow
Reusing dataset xtreme (/root/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e/cache-38d4346c12a33c0c.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e/cache-1ff05c3c3e40c83f.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e/cache-8a25bc3c18ffdad0.arrow
Reusing dataset xtreme (/root/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e/cache-9074c181e0917e20.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e/cache-ceb89343d1588e40.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e/cache-02527b8c43081a5e.arrow


Unnamed: 0,de,en,fr,it
Count,12580,1180,4580,1680


In [35]:
# look at the first example of the German ("de") training subset
element = panx["de"]["train"][0]
element

{'langs': ['de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de'],
 'ner_tags': [3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'tokens': ['Olympique',
  'Nîmes',
  ',',
  'Auxerres',
  'seinerzeitiger',
  'drittklassiger',
  'Endspielgegner',
  ',',
  'hatte',
  'sich',
  'erst',
  'gar',
  'nicht',
  'für',
  'die',
  'Hauptrunde',
  'qualifizieren',
  'können',
  '.']}

In [36]:
# the same example as (column name, values) pairs
element.items()

dict_items([('tokens', ['Olympique', 'Nîmes', ',', 'Auxerres', 'seinerzeitiger', 'drittklassiger', 'Endspielgegner', ',', 'hatte', 'sich', 'erst', 'gar', 'nicht', 'für', 'die', 'Hauptrunde', 'qualifizieren', 'können', '.']), ('ner_tags', [3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), ('langs', ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de'])])

In [39]:
# feature (column type) definitions of the German training subset
panx["de"]["train"].features

{'langs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [51]:
# tag names of the ner_tags ClassLabel; the list position is the integer tag id
panx["de"]["train"].features['ner_tags'].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [52]:
# keep the ClassLabel feature of ner_tags; create_tag_names uses it to map ids to names
ner_tags = panx["de"]["train"].features['ner_tags'].feature

In [55]:
# ClassLabel.int2str returns the string name for an integer ner_tag

def create_tag_names(batch):
    """Build the ``ner_tags_str`` column from the integer ``ner_tags``.

    Every id in ``batch["ner_tags"]`` is converted with ``ner_tags.int2str``,
    where ``ner_tags`` is the module-level ClassLabel feature.

    Returns a dict with the single key ``"ner_tags_str"`` holding the list of
    tag names, in the same order as the input ids.
    """
    tag_names = []
    for tag_id in batch["ner_tags"]:
        tag_names.append(ner_tags.int2str(tag_id))
    return {"ner_tags_str": tag_names}

In [56]:
# add a column with the string form of the tags to every German split

panx_de = panx["de"].map(create_tag_names)
de_element = panx_de["train"][0]
# first training example: one row of tokens above one row of tag names
pd.DataFrame(
    [de_element["tokens"], de_element["ner_tags_str"]],
    index=["tokens", "tags"],
)

  0%|          | 0/12580 [00:00<?, ?ex/s]

  0%|          | 0/6290 [00:00<?, ?ex/s]

  0%|          | 0/6290 [00:00<?, ?ex/s]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
tokens,Olympique,Nîmes,",",Auxerres,seinerzeitiger,drittklassiger,Endspielgegner,",",hatte,sich,erst,gar,nicht,für,die,Hauptrunde,qualifizieren,können,.
tags,B-ORG,I-ORG,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O


In [61]:
# count entities per type in each split to check that the labels are balanced

split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    for row in dataset["ner_tags_str"]:
        # a "B-" tag opens an entity, so counting those counts entities
        entity_starts = (name for name in row if name.startswith("B"))
        for tag in entity_starts:
            # "B-ORG" -> "ORG"
            tag_type = tag.split("-")[1]
            split2freqs[split][tag_type] += 1

# one row per split, one column per entity type
pd.DataFrame.from_dict(split2freqs, orient="index")


Unnamed: 0,ORG,PER,LOC
train,5397,5881,6169
validation,2639,2870,3172
test,2657,2971,3100


The labels are balanced well enough across the splits, so we can move on to tokenization.
