In [1]:
from datasets import get_dataset_config_names
# List every configuration (sub-dataset) name available under the XTREME benchmark.
xtreme_subsets = get_dataset_config_names("xtreme")
print(f"XTREME has {len(xtreme_subsets)} configurations")

Downloading builder script:   0%|          | 0.00/37.5k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/105k [00:00<?, ?B/s]

XTREME gas 183 configurations


In [2]:
# Keep only the PAN-X configurations (their names begin with "PAN").
panx_subsets = [
    config_name
    for config_name in xtreme_subsets
    if config_name.startswith("PAN")
]
panx_subsets[:3]

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']

In [4]:
from datasets import load_dataset
# Load the German ("de") PAN-X configuration of XTREME; the returned object is
# shown as the cell output so its splits and features can be inspected.
load_dataset("xtreme", name="PAN-X.de")

Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

###### Create a dict with key = language and value = PAN-X corpus (type DatasetDict), keeping only a fraction (fracs) of each of the languages used in the model (langs)

In [7]:
from collections import defaultdict
from datasets import DatasetDict

langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059]
# Looking up a language that has no entry yet creates an empty DatasetDict for it.
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    # Load the monolingual PAN-X corpus for this language.
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    # Shuffle every split with a fixed seed, then keep its first `frac` share of rows.
    for split in ds:
        shuffled = ds[split].shuffle(seed=0)
        # int() truncates, so the kept row count is rounded down.
        n_keep = int(frac * ds[split].num_rows)
        panx_ch[lang][split] = shuffled.select(range(n_keep))

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [10]:
import pandas as pd
# One column per language; the single row holds the size of its training split.
train_sizes = {lang: [panx_ch[lang]["train"].num_rows] for lang in langs}
pd.DataFrame(train_sizes, index=["Number of training examples"])

Unnamed: 0,de,fr,it,en
Number of training examples,12580,4580,1680,1180


###### Since German is the language with the most examples, we will use it as the base for zero-shot cross-lingual transfer to the other languages

In [13]:
# Take the first German training example and print each field as "name:value".
element = panx_ch["de"]["train"][0]
for key, value in element.items():
    print(key, value, sep=":")

tokens:['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags:[0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs:['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']


###### Check the ner_tags class names: 'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'

In [14]:
# Print the declared feature type of every column in the German training split.
train_features = panx_ch["de"]["train"].features
for key, value in train_features.items():
    print(key, value, sep=":")

tokens:Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags:Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs:Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)


In [15]:
# The "ner_tags" column is a sequence feature; its inner `.feature` is the
# per-token label definition that is kept in `tags`.
ner_tags_column = panx_ch["de"]["train"].features["ner_tags"]
tags = ner_tags_column.feature
print(tags)

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


###### Build a dict with key = new column name and value = the class names of the tags

In [25]:
def create_tag_names(batch):
    """Translate the integer NER tag ids of `batch` into their string names.

    Reads `batch["ner_tags"]` and converts each entry with the module-level
    `tags.int2str`. Returns a dict with the single key "ner_tags_str" holding
    the converted values in the same order.
    """
    tag_ids = batch["ner_tags"]
    return {"ner_tags_str": list(map(tags.int2str, tag_ids))}

In [26]:
# Run create_tag_names over the German corpus; the result (which carries the
# "ner_tags_str" values) is stored under the new name panx_de.
panx_de=panx_ch["de"].map(create_tag_names)

In [28]:
# Show the tokens of the first German training example aligned with their tag names.
de_example = panx_de["train"][0]
pd.DataFrame(
    data=[de_example["tokens"], de_example["ner_tags_str"]],
    index=["tokens", "tags"],
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
tokens,2.000,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
tags,O,O,O,O,B-LOC,I-LOC,O,O,B-LOC,B-LOC,I-LOC,O


###### Check the class balance by calculating the frequency of each entity type in every split

In [35]:
from collections import Counter
# Per split, count how many entities of each type occur. Only "B-" tags are
# counted, so each entity is counted once no matter how many tokens it spans.
split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    for row in dataset["ner_tags_str"]:
        entity_types = [tag.split("-")[1] for tag in row if tag.startswith("B")]
        # Only touch the counter when something was found, so a split gets an
        # entry only once it has at least one entity.
        if entity_types:
            split2freqs[split].update(entity_types)
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071
