In [10]:
import datasets

class MasakhaNERDataset:
    def __init__(self,sample_size):
        self.all_data = {}
        self.train_data = {}
        self.valid_data = {}
        self.test_data = {}
        self.sample_size=sample_size
        self.LANGS = [
            'bam', 'bbj', 'ewe', 'fon', 'hau', 'ibo', 'kin', 'lug', 'luo', 'mos',
            'nya', 'pcm', 'sna', 'swa', 'tsn', 'twi', 'wol', 'xho', 'yor', 'zul'
        ]
        self.LABELS = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-DATE", "I-DATE"]
        self.load_data()
        
    def convert_ner_tags(self, ner_tags, to_labels=True):
        # Define the mapping from integer tags to string labels
        tag_to_label = {
            0: "O",
            1: "B-PER",
            2: "I-PER",
            3: "B-ORG",
            4: "I-ORG",
            5: "B-LOC",
            6: "I-LOC",
            7: "B-DATE",
            8: "I-DATE"
        }
    
        # Create the reverse mapping from string labels to integer tags
        label_to_tag = {label: tag for tag, label in tag_to_label.items()}
    
        if to_labels:
        # Convert integer tags to string labels
            return [tag_to_label[tag] for tag in ner_tags]
        else:
            # Convert string labels to integer tags
            return [label_to_tag[label] for label in ner_tags]

    def load_data(self):
            for lang in self.LANGS:
                dataset = datasets.load_dataset('masakhane/masakhaner2', lang)
                # Load samples based on sample_size argument
                if self.sample_size>0:
                    self.train_data[lang] = dataset['train'].select(range(min(self.sample_size, len(dataset['train']))))
                    self.valid_data[lang] = dataset['validation'].select(range(min(self.sample_size, len(dataset['validation']))))
                    self.test_data[lang] = dataset['test'].select(range(min(self.sample_size, len(dataset['test']))))
                else:
                    self.train_data[lang] = dataset['train']
                    self.valid_data[lang] = dataset['validation']
                    self.test_data[lang] = dataset['test']
            self.all_data = dataset
            print(f"Total training samples: {len(self.train_data)}")
            print(f"Total validation samples: {len(self.valid_data)}")
            print(f"Total test samples: {len(self.test_data)}")


In [12]:
from pprint import pprint
# Initialize the dataset for Yorùbá
dataset = MasakhaNERDataset(sample_size=2)

# Access the training data
train_data = dataset.train_data

for lang in dataset.LANGS:
        print(f"Processing language: {lang}")

        # Get train and test data
        train_data = dataset.train_data
        test_data = dataset.test_data

        train_tokens = [sample[lang]['tokens'] for sample in train_data]


Total training samples: 20
Total validation samples: 20
Total test samples: 20
Processing language: bam


TypeError: string indices must be integers, not 'str'

In [15]:
import transformers
print(transformers.__version__)

4.44.2


In [18]:
train_data_ner_tags = [0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
converted_labels = dataset.convert_ner_tags(train_data_ner_tags, to_labels=True)
print("Converted to labels:", converted_labels)

# Convert back to integer tags
converted_tags = dataset.convert_ner_tags(converted_labels, to_labels=False)
print("Converted back to tags:", converted_tags)

Converted to labels: ['O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Converted back to tags: [0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [6]:
def process_model_output(output, num_tokens):
    pred_labels = output.strip().split()
    # Handle mismatch in the number of tokens and predicted labels
    if len(pred_labels) < num_tokens:
        # Pad with 'O'
        pred_labels.extend(['O'] * (num_tokens - len(pred_labels)))
    elif len(pred_labels) > num_tokens:
        # Truncate to match the number of tokens
        pred_labels = pred_labels[:num_tokens]
    return pred_labels
process_model_output("O",10)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']