In [13]:
from datasets import load_dataset

class MasakhaNERDataset:
    """Wrapper that loads one language of ``masakhane/masakhaner2``.

    Creating an instance immediately calls :meth:`load_data`, which fetches
    the dataset for ``src_lang`` and exposes its splits as ``train_data``,
    ``valid_data`` and ``test_data``; ``all_data`` keeps the full object
    returned by ``load_dataset``.

    ``LANGS`` and ``LABELS`` are defined on the class so they (and
    :meth:`convert_ner_tags`) can be used without loading any data.
    """

    # Language codes accepted by ``load_data``.
    LANGS = [
        'bam', 'bbj', 'ewe', 'fon', 'hau', 'ibo', 'kin', 'lug', 'luo', 'mos',
        'nya', 'pcm', 'sna', 'swa', 'tsn', 'twi', 'wol', 'xho', 'yor', 'zul'
    ]
    # NER label names. The position of a label in this list is its integer
    # tag id; ``convert_ner_tags`` derives both directions from this list.
    LABELS = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-DATE", "I-DATE"]

    def __init__(self, prompt="", src_lang='yor'):
        """Store the settings and load the data for ``src_lang``.

        Args:
            prompt: Free-form string kept on the instance as ``self.prompt``;
                no method of this class reads it.
            src_lang: Language code to load; must be one of ``LANGS``.

        Raises:
            ValueError: If ``src_lang`` is not in ``LANGS`` (raised by
                ``load_data``).
        """
        # Placeholders; every one of them is replaced by load_data() below.
        self.all_data = {}
        self.train_data = {}
        self.valid_data = {}
        self.test_data = {}
        self.prompt = prompt
        # Per-instance copies keep the original attribute layout (instance
        # attributes that can be edited without touching other instances).
        self.LANGS = list(type(self).LANGS)
        self.LABELS = list(type(self).LABELS)
        self.src_lang = src_lang
        self.load_data()

    def convert_ner_tags(self, ner_tags, to_labels=True):
        """Convert between integer NER tags and their string labels.

        Args:
            ner_tags: Iterable of integer tag ids (when ``to_labels`` is
                true) or of label strings (when it is false).
            to_labels: Direction of the conversion. ``True`` maps
                ids -> labels, ``False`` maps labels -> ids.

        Returns:
            A new list with one converted element per input element.

        Raises:
            KeyError: If an element is not a known tag id / label.
        """
        if to_labels:
            # id -> label; a dict (not list indexing) so that out-of-range
            # and negative ids raise KeyError instead of wrapping around.
            mapping = dict(enumerate(self.LABELS))
        else:
            # label -> id
            mapping = {label: tag for tag, label in enumerate(self.LABELS)}
        return [mapping[item] for item in ner_tags]

    def load_data(self):
        """Load the dataset for ``self.src_lang`` and report split sizes.

        Populates ``train_data``, ``valid_data``, ``test_data`` and
        ``all_data``, then prints the number of samples in each split.

        Raises:
            ValueError: If ``self.src_lang`` is not in ``self.LANGS``. The
                check runs before ``load_dataset`` is called.
        """
        if self.src_lang not in self.LANGS:
            raise ValueError(f"Language '{self.src_lang}' is not supported.")

        dataset = load_dataset('masakhane/masakhaner2', self.src_lang)
        self.train_data = dataset['train']
        self.valid_data = dataset['validation']
        self.test_data = dataset['test']
        self.all_data = dataset

        print(f"Data loaded for language: {self.src_lang}")
        print(f"Training samples: {len(self.train_data)}")
        print(f"Validation samples: {len(self.valid_data)}")
        print(f"Test samples: {len(self.test_data)}")

In [14]:
from pprint import pprint
# Build the wrapper for Yorùbá; the constructor loads the three splits and
# prints their sizes.
dataset = MasakhaNERDataset(src_lang='yor')

# Keep a handle on the training split for the cells below.
train_data = dataset.train_data

# Show the integer NER tags of the first training example.
first_example = train_data[0]
print(first_example["ner_tags"])


Data loaded for language: yor
Training samples: 6876
Validation samples: 983
Test samples: 1964
[0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [15]:
import transformers

# Show which transformers release is installed in this kernel.
transformers_version = transformers.__version__
print(transformers_version)

4.44.2


In [18]:
# Read the tags of the first training example from the loaded dataset rather
# than pasting a copy of previously printed output, so this cell cannot drift
# from the data if the language or dataset version changes.
train_data_ner_tags = dataset.train_data[0]["ner_tags"]
converted_labels = dataset.convert_ner_tags(train_data_ner_tags, to_labels=True)
print("Converted to labels:", converted_labels)

# Convert back to integer tags
converted_tags = dataset.convert_ner_tags(converted_labels, to_labels=False)
print("Converted back to tags:", converted_tags)

# The two directions must be exact inverses; fail loudly instead of relying on
# a visual comparison of the two printed lists.
assert converted_tags == list(train_data_ner_tags), "tag -> label -> tag round trip changed the tags"

Converted to labels: ['O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Converted back to tags: [0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
