## Load and preprocess data

In [1]:
from datasets import load_dataset

# Load the "conll2003" dataset; the result is kept in `dataset` and indexed by split name below.
dataset = load_dataset("conll2003")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [3]:
# Peek at the token list of the first training example.
dataset['train'][0]['tokens']

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [4]:
# Label string -> integer id; each label's id is its position in the list.
ner_tags = {
    label: index
    for index, label in enumerate(
        ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
    )
}

In [5]:
# Invert ner_tags into an integer id -> label string lookup.
swapped_dict = dict(zip(ner_tags.values(), ner_tags.keys()))

# Show the inverted mapping
print(swapped_dict)

{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}


In [6]:
# Decode the first training example's integer NER tags into label strings.
[swapped_dict[x] for x in dataset['train'][0]['ner_tags']]

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [7]:
# Show the complete first training record, with every feature column.
dataset['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [8]:
def label_tokens(entry):
    """Add an 'ner_labels' field with the label string for each id in entry['ner_tags'].

    Ids are translated through the module-level `swapped_dict`. The entry is
    modified in place and returned.
    """
    label_names = []
    for tag_id in entry['ner_tags']:
        label_names.append(swapped_dict[tag_id])
    entry['ner_labels'] = label_names
    return entry


In [9]:
# Attach string NER labels to each split, in the same order as before.
for split_name in ('train', 'test', 'validation'):
    dataset[split_name] = dataset[split_name].map(label_tokens)


In [10]:
def tokens_to_sentence(entry):
    """Add a 'sentence' field: entry['tokens'] joined with single spaces.

    The entry is modified in place and returned.
    """
    joined = ' '.join(entry['tokens'])
    entry['sentence'] = joined
    return entry

# Add the joined sentence string to each split, in the same order as before.
for split_name in ('train', 'test', 'validation'):
    dataset[split_name] = dataset[split_name].map(tokens_to_sentence)


In [11]:
def extract_entities(entry):
    """Group the entry's labelled words into entity strings, bucketed by entity type.

    Reads entry['sentence'] (split on whitespace) and entry['ner_labels'] (one
    'O' / 'B-<TYPE>' / 'I-<TYPE>' label per word) and stores the result under
    entry['entities'] as a dict with keys 'PER', 'ORG', 'LOC' and 'MISC', each
    holding a list of space-joined entity strings in order of appearance.

    Span rules:
      * 'B-<TYPE>' always closes the open entity (if any) and starts a new one.
      * 'I-<TYPE>' extends the open entity only when the types match. Otherwise
        (no open entity, or an open entity of a different type) it starts a new
        entity, so malformed or IOB1-style sequences neither lose words nor
        merge words into an entity of the wrong type.
      * Any other label closes the open entity.

    Args:
        entry: mapping with 'sentence' (str) and 'ner_labels' (list of str).

    Returns:
        The same entry object, with 'entities' added.

    Raises:
        KeyError: if a label names a type other than PER, ORG, LOC or MISC.
    """
    entities = {'PER': [], 'ORG': [], 'LOC': [], 'MISC': []}
    open_type = None   # type of the entity currently being built, or None
    open_words = []    # words collected so far for that entity

    for word, label in zip(entry['sentence'].split(), entry['ner_labels']):
        is_begin = label.startswith('B-')
        is_inside = label.startswith('I-')
        entity_type = label.split('-')[1] if (is_begin or is_inside) else None

        # Only a same-type I- label continues the open entity.
        if is_inside and entity_type == open_type:
            open_words.append(word)
            continue

        # Every other label ends the open entity, if there is one ...
        if open_type is not None:
            entities[open_type].append(' '.join(open_words))

        # ... and a B-/I- label immediately opens the next one.
        if entity_type is None:
            open_type, open_words = None, []
        else:
            open_type, open_words = entity_type, [word]

    # Flush an entity that runs to the end of the sentence.
    if open_type is not None:
        entities[open_type].append(' '.join(open_words))

    entry['entities'] = entities
    return entry

# Extract entities for each split, in the same order as before.
for split_name in ('train', 'test', 'validation'):
    dataset[split_name] = dataset[split_name].map(extract_entities)



In [12]:
# Spot-check one training example: its joined sentence next to the entities extracted from it.
dataset['train'][10]['sentence'], dataset['train'][10]['entities']

('Spanish Farm Minister Loyola de Palacio had earlier accused Fischler at an EU farm ministers \' meeting of causing unjustified alarm through " dangerous generalisation . "',
 {'LOC': [],
  'MISC': ['Spanish'],
  'ORG': ['EU'],
  'PER': ['Loyola de Palacio', 'Fischler']})

In [13]:
"""import random

# Get the length of the training split
train_length = len(dataset['train'])
test_length  = len(dataset['test'])

# Sample 100 random indices
random.seed(42)
train_random_indices = random.sample(range(train_length), 1000)

random.seed(42)
test_random_indices = random.sample(range(test_length), 100)

# Retrieve the sampled entries
#train_sample = [dataset['train'][idx] for idx in train_random_indices]
test_sample = [dataset['test'][idx] for idx in test_random_indices]"""

# Take a seeded sample from each split: 1000 train rows, 100 test rows, 100 validation rows.
train_sample, test_sample, val_sample = (
    dataset[split_name].shuffle(seed=42).select(range(row_count))
    for split_name, row_count in (("train", 1000), ("test", 100), ("validation", 100))
)


In [14]:
train_sample

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_labels', 'sentence', 'entities'],
    num_rows: 1000
})

In [15]:
test_sample

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_labels', 'sentence', 'entities'],
    num_rows: 100
})

In [16]:
from collections import Counter

def get_count(entries):
    """Print the total number of PER, ORG, LOC and MISC entity mentions in `entries`.

    Each item of `entries` must expose item['entities'], a mapping with the
    keys 'PER', 'ORG', 'LOC' and 'MISC', each holding a list of entity strings.
    Repeated mentions are counted every time they occur. Nothing is returned;
    the four totals are written to standard output.
    """
    # One counter per entity type.
    tallies = {
        'PER': Counter(),
        'ORG': Counter(),
        'LOC': Counter(),
        'MISC': Counter(),
    }

    # Feed every record's entity lists into the matching counter.
    for record in entries:
        for kind, tally in tallies.items():
            tally.update(record['entities'][kind])

    # Report the totals, one line per entity type.
    print("Total PER entities:", sum(tallies['PER'].values()))
    print("Total ORG entities:", sum(tallies['ORG'].values()))
    print("Total LOC entities:", sum(tallies['LOC'].values()))
    print("Total MISC entities:", sum(tallies['MISC'].values()))


In [17]:
get_count(train_sample)

Total PER entities: 445
Total ORG entities: 437
Total LOC entities: 516
Total MISC entities: 258


In [18]:
get_count(test_sample)

Total PER entities: 68
Total ORG entities: 45
Total LOC entities: 60
Total MISC entities: 26


In [24]:
get_count(val_sample)

Total PER entities: 55
Total ORG entities: 36
Total LOC entities: 62
Total MISC entities: 24


In [19]:
# Save every split in JSON Lines format
import os

# Make sure the output directory exists before writing into it.
os.makedirs("data", exist_ok=True)

# The loop variable must not be called `dataset`: that would rebind the name to
# the last split, so re-running any earlier cell that indexes `dataset` by
# split name would fail.
for split, split_dataset in dataset.items():
    split_dataset.to_json(f"data/my-conll2003-dataset-{split}.jsonl")

Creating json from Arrow format: 100%|██████████| 15/15 [00:00<00:00, 98.39ba/s]
Creating json from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 96.43ba/s]
Creating json from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 106.56ba/s]


In [20]:
train_sample

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_labels', 'sentence', 'entities'],
    num_rows: 1000
})

In [21]:
# Save the training sample in JSON Lines format (plain string: the path has no placeholders).
train_sample.to_json("data/my-conll2003-dataset-train_sample.jsonl")


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 27.49ba/s]


528287

In [22]:
# Save the test sample in JSON Lines format (plain string: the path has no placeholders).
test_sample.to_json("data/my-conll2003-dataset-test_sample.jsonl")


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 164.65ba/s]


54029

In [23]:
# Save the validation sample in JSON Lines format (plain string: the path has no placeholders).
val_sample.to_json("data/my-conll2003-dataset-val_sample.jsonl")


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 156.17ba/s]


54846