# Packages

In [3]:
import datasets
import numpy as np
from huggingface_hub import login

# Setting HF

In [4]:
# Log in to your Hugging Face account.
# The access token is read from ../keys/hf_key.txt instead of being written in this notebook.
with open('../keys/hf_key.txt', 'r') as f:
    # Strip surrounding whitespace: a trailing newline in the file is not part of the token
    hf_key = f.read().strip()

# The key file is already closed when the login call runs
login(token=hf_key)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\cs_bllin001\.cache\huggingface\token
Login successful


# Dataset

In [5]:
# Load the 'conll2003' dataset through the `datasets` library
dataset = datasets.load_dataset('conll2003')

In [6]:
# Show the available splits with their columns and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

# Define new NER tags

In [7]:
# Tag-name lists for the three annotation columns of the train split;
# a tag's position in its list is the integer stored in the dataset.
pos_list, chunk_list, label_list = (
    dataset["train"].features[column].feature.names
    for column in ('pos_tags', 'chunk_tags', 'ner_tags')
)

In [8]:
# Extra entity labels that are appended after the NER labels shipped with the dataset
new_tags = ['B-ATTR', 'I-ATTR', 'B-ACT', 'I-ACT']
# Append only the tags that are not in the list yet, so running this cell
# more than once does not duplicate them
label_list = label_list + [tag for tag in new_tags if tag not in label_list]
label_list

['O',
 'B-PER',
 'I-PER',
 'B-ORG',
 'I-ORG',
 'B-LOC',
 'I-LOC',
 'B-MISC',
 'I-MISC',
 'B-ATTR',
 'I-ATTR',
 'B-ACT',
 'I-ACT']

In [9]:
# Print the three tag vocabularies; the NER list already includes the added ATTR/ACT labels
print('POS:', pos_list)
print('Chunk:', chunk_list)
print('NER:', label_list)

POS: ['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']
Chunk: ['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP']
NER: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ATTR', 'I-ATTR', 'B-ACT', 'I-ACT']


# Attribute entity

## Creating rule

In [10]:
# Reverse lookup: chunk tag name -> its integer id (its position in chunk_list)
chunk_index = {tag_name: tag_id for tag_id, tag_name in enumerate(chunk_list)}
chunk_index

{'O': 0,
 'B-ADJP': 1,
 'I-ADJP': 2,
 'B-ADVP': 3,
 'I-ADVP': 4,
 'B-CONJP': 5,
 'I-CONJP': 6,
 'B-INTJ': 7,
 'I-INTJ': 8,
 'B-LST': 9,
 'I-LST': 10,
 'B-NP': 11,
 'I-NP': 12,
 'B-PP': 13,
 'I-PP': 14,
 'B-PRT': 15,
 'I-PRT': 16,
 'B-SBAR': 17,
 'I-SBAR': 18,
 'B-UCP': 19,
 'I-UCP': 20,
 'B-VP': 21,
 'I-VP': 22}

In [11]:
# Choose the split, the tag column to inspect, and the tag ids of interest
data = 'train'
key_to_filter = 'chunk_tags'
desired_values = [1,2]  # ids of "B-ADJP" and "I-ADJP" in chunk_list

# Keep every example whose tag column contains at least one of the wanted ids
filtered_dataset = [
    item
    for item in dataset[data]
    if any(tag in desired_values for tag in item.get(key_to_filter, []))
]

filtered_dataset[0]

{'id': '4',
 'tokens': ['Germany',
  "'s",
  'representative',
  'to',
  'the',
  'European',
  'Union',
  "'s",
  'veterinary',
  'committee',
  'Werner',
  'Zwingmann',
  'said',
  'on',
  'Wednesday',
  'consumers',
  'should',
  'buy',
  'sheepmeat',
  'from',
  'countries',
  'other',
  'than',
  'Britain',
  'until',
  'the',
  'scientific',
  'advice',
  'was',
  'clearer',
  '.'],
 'pos_tags': [22,
  27,
  21,
  35,
  12,
  22,
  22,
  27,
  16,
  21,
  22,
  22,
  38,
  15,
  22,
  24,
  20,
  37,
  21,
  15,
  24,
  16,
  15,
  22,
  15,
  12,
  16,
  21,
  38,
  17,
  7],
 'chunk_tags': [11,
  11,
  12,
  13,
  11,
  12,
  12,
  11,
  12,
  12,
  12,
  12,
  21,
  13,
  11,
  12,
  21,
  22,
  11,
  13,
  11,
  1,
  13,
  11,
  17,
  11,
  12,
  12,
  21,
  1,
  0],
 'ner_tags': [5,
  0,
  0,
  0,
  0,
  3,
  4,
  0,
  0,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

### One example

In [12]:
def sentence_diagram(example, pos_list, chunk_list, label_list):
    """
    Print a sentence followed by one aligned row per token.

    The first printed line is the tokens joined by single spaces. Each following
    line shows a token with its POS, chunk and NER tag names, left-aligned and
    padded with '-' (40 columns for the token, 10 for POS, 10 for chunk).

    Args:
        example (dict): Must provide "tokens", "pos_tags", "chunk_tags" and
            "ner_tags"; the three tag entries hold integer indices.
        pos_list (list): POS tag names, indexed by the values in "pos_tags".
        chunk_list (list): Chunk tag names, indexed by the values in "chunk_tags".
        label_list (list): NER tag names, indexed by the values in "ner_tags".

    Returns:
        None: the diagram is written to standard output.
    """
    def decode(names, ids):
        # Translate integer tag ids into their tag names
        return [names[tag_id] for tag_id in ids]

    pos_names = decode(pos_list, example["pos_tags"])
    chunk_names = decode(chunk_list, example["chunk_tags"])
    ner_names = decode(label_list, example["ner_tags"])

    words = example['tokens']
    print(' '.join(words))

    # zip stops at the shortest sequence, so only fully tagged tokens are shown
    for row in zip(words, pos_names, chunk_names, ner_names):
        word, pos_name, chunk_name, ner_name = row
        print(f"{word:-<40} {pos_name:-<10} {chunk_name:-<10} {ner_name}")

In [13]:
# Use the first matching sentence as the worked example and print its tag diagram
example = filtered_dataset[0]
sentence_diagram(example, pos_list, chunk_list, label_list)

Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer .
Germany--------------------------------- NNP------- B-NP------ B-LOC
's-------------------------------------- POS------- B-NP------ O
representative-------------------------- NN-------- I-NP------ O
to-------------------------------------- TO-------- B-PP------ O
the------------------------------------- DT-------- B-NP------ O
European-------------------------------- NNP------- I-NP------ B-ORG
Union----------------------------------- NNP------- I-NP------ I-ORG
's-------------------------------------- POS------- B-NP------ O
veterinary------------------------------ JJ-------- I-NP------ O
committee------------------------------- NN-------- I-NP------ O
Werner---------------------------------- NNP------- I-NP------ B-PER
Zwingmann------------------------------- NNP------- I-N

#### Assign "ATTR" tag

##### Function

In [14]:
def apply_chunk_to_ner_mapping(example, chunk_to_ner_mapping, label_all_tokens = True):
    """
    Return a copy of the example whose "ner_tags" are re-labelled from its "chunk_tags".

    For every position whose chunk tag is a key of ``chunk_to_ner_mapping``, the NER
    tag at that position is replaced by the mapped value; every other NER tag is kept.
    The input ``example`` and its "ner_tags" list are left unmodified.

    Args:
        example (dict): The example dictionary containing "chunk_tags" and "ner_tags".
        chunk_to_ner_mapping (dict): A mapping from chunk tags to ner tags.
        label_all_tokens (bool): Not used by this function; kept so existing calls
            that pass it keep working.

    Returns:
        dict: A shallow copy of the example with a new, updated "ner_tags" list.
    """
    # Work on a fresh list so the caller's example is not changed in place
    ner_tags = list(example['ner_tags'])
    for position, chunk in enumerate(example['chunk_tags']):
        if chunk in chunk_to_ner_mapping:
            ner_tags[position] = chunk_to_ner_mapping[chunk]

    # Every other entry is carried over unchanged
    return {**example, 'ner_tags': ner_tags}

##### Applied

In [15]:
# Chunk-tag id -> NER-label id used to create the "ATTR" entity
chunk_to_ner_mapping = {
    1: 9,  # "B-ADJP": "B-ATTR"
    2: 10  # "I-ADJP": "I-ATTR"
}

In [16]:
# Re-label the "ner_tags" of the sample sentence using the chunk -> NER mapping
modifed_example = apply_chunk_to_ner_mapping(example, chunk_to_ner_mapping)

# Print the diagram of the re-labelled sentence
sentence_diagram(modifed_example, pos_list, chunk_list, label_list)

Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer .
Germany--------------------------------- NNP------- B-NP------ B-LOC
's-------------------------------------- POS------- B-NP------ O
representative-------------------------- NN-------- I-NP------ O
to-------------------------------------- TO-------- B-PP------ O
the------------------------------------- DT-------- B-NP------ O
European-------------------------------- NNP------- I-NP------ B-ORG
Union----------------------------------- NNP------- I-NP------ I-ORG
's-------------------------------------- POS------- B-NP------ O
veterinary------------------------------ JJ-------- I-NP------ O
committee------------------------------- NN-------- I-NP------ O
Werner---------------------------------- NNP------- I-NP------ B-PER
Zwingmann------------------------------- NNP------- I-N

## Assign "ATTR" entity to the entire dataset

In [18]:
def apply_mapping_to_split(dataset_split,chunk_to_ner_mapping):
    """
    Re-label every example of one split and rebuild the split as a new Dataset.

    Args:
        dataset_split: Iterable of example dicts providing the columns
            'id', 'tokens', 'pos_tags', 'chunk_tags' and 'ner_tags'.
        chunk_to_ner_mapping (dict): Chunk tag -> NER tag mapping handed to
            apply_chunk_to_ner_mapping for each example.

    Returns:
        datasets.Dataset: A dataset built from the five re-collected columns.
    """
    column_names = ('id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags')
    columns = {name: [] for name in column_names}

    for row in dataset_split:
        remapped = apply_chunk_to_ner_mapping(row, chunk_to_ner_mapping)

        # Collect the values column by column, the layout from_dict expects
        for name in column_names:
            columns[name].append(remapped[name])

    # NOTE(review): from_dict is called without `features`, so the column schema
    # is presumably inferred from the values - confirm that is intended.
    return datasets.Dataset.from_dict(columns)


In [19]:
# Re-label every split: ADJP chunk tags become the "ATTR" entity labels

chunk_to_ner_mapping = {
    1: 9,   # "B-ADJP": "B-ATTR"
    2: 10,  # "I-ADJP": "I-ATTR"
}

# Rebuild the DatasetDict split by split; `dataset` is rebound to the re-labelled version
dataset = datasets.DatasetDict({
    split: apply_mapping_to_split(dataset[split], chunk_to_ner_mapping)
    for split in ('train', 'validation', 'test')
})
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [20]:
# Spot-check one sentence of the re-labelled train split
sentence_diagram(dataset['train'][16], pos_list, chunk_list, label_list)

" What we have to be extremely careful of is how other countries are going to take Germany 's lead , " Welsh National Farmers ' Union ( NFU ) chairman John Lloyd Jones said on BBC radio .
"--------------------------------------- "--------- O--------- O
What------------------------------------ WP-------- B-NP------ O
we-------------------------------------- PRP------- B-NP------ O
have------------------------------------ VBP------- B-VP------ O
to-------------------------------------- TO-------- I-VP------ O
be-------------------------------------- VB-------- I-VP------ O
extremely------------------------------- RB-------- B-ADJP---- B-ATTR
careful--------------------------------- JJ-------- I-ADJP---- I-ATTR
of-------------------------------------- IN-------- B-PP------ O
is-------------------------------------- VBZ------- B-VP------ O
how------------------------------------- WRB------- B-ADVP---- O
other----------------------------------- JJ-------- B-NP------ O
countries-------------

# Action entity

## Creating rule

In [21]:
# Reverse lookup: chunk tag name -> its integer id (its position in chunk_list)
chunk_index = dict(zip(chunk_list, range(len(chunk_list))))
chunk_index

{'O': 0,
 'B-ADJP': 1,
 'I-ADJP': 2,
 'B-ADVP': 3,
 'I-ADVP': 4,
 'B-CONJP': 5,
 'I-CONJP': 6,
 'B-INTJ': 7,
 'I-INTJ': 8,
 'B-LST': 9,
 'I-LST': 10,
 'B-NP': 11,
 'I-NP': 12,
 'B-PP': 13,
 'I-PP': 14,
 'B-PRT': 15,
 'I-PRT': 16,
 'B-SBAR': 17,
 'I-SBAR': 18,
 'B-UCP': 19,
 'I-UCP': 20,
 'B-VP': 21,
 'I-VP': 22}

In [22]:
# Choose the split, the tag column to inspect, and the tag ids of interest
data = 'train'
key_to_filter = 'chunk_tags'
desired_values = [21,22]  # ids of "B-VP" and "I-VP" in chunk_list

# Keep every example whose tag column shares at least one id with desired_values
filtered_dataset = list(filter(
    lambda item: not set(desired_values).isdisjoint(item.get(key_to_filter, [])),
    dataset[data],
))

filtered_dataset[0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

### One example

In [23]:
def sentence_diagram(example, pos_list, chunk_list, label_list):
    """
    Print a sentence followed by one aligned row per token.

    The first printed line is the tokens joined by single spaces. Each following
    line shows a token with its POS, chunk and NER tag names, left-aligned and
    padded with '-' (40 columns for the token, 10 for POS, 10 for chunk).

    Args:
        example (dict): Must provide "tokens", "pos_tags", "chunk_tags" and
            "ner_tags"; the three tag entries hold integer indices.
        pos_list (list): POS tag names, indexed by the values in "pos_tags".
        chunk_list (list): Chunk tag names, indexed by the values in "chunk_tags".
        label_list (list): NER tag names, indexed by the values in "ner_tags".

    Returns:
        None: the diagram is written to standard output.
    """
    def decode(names, ids):
        # Translate integer tag ids into their tag names
        return [names[tag_id] for tag_id in ids]

    pos_names = decode(pos_list, example["pos_tags"])
    chunk_names = decode(chunk_list, example["chunk_tags"])
    ner_names = decode(label_list, example["ner_tags"])

    words = example['tokens']
    print(' '.join(words))

    # zip stops at the shortest sequence, so only fully tagged tokens are shown
    for row in zip(words, pos_names, chunk_names, ner_names):
        word, pos_name, chunk_name, ner_name = row
        print(f"{word:-<40} {pos_name:-<10} {chunk_name:-<10} {ner_name}")

In [24]:
# Use the first matching sentence as the worked example and print its tag diagram
example = filtered_dataset[0]
sentence_diagram(example, pos_list, chunk_list, label_list)

EU rejects German call to boycott British lamb .
EU-------------------------------------- NNP------- B-NP------ B-ORG
rejects--------------------------------- VBZ------- B-VP------ O
German---------------------------------- JJ-------- B-NP------ B-MISC
call------------------------------------ NN-------- I-NP------ O
to-------------------------------------- TO-------- B-VP------ O
boycott--------------------------------- VB-------- I-VP------ O
British--------------------------------- JJ-------- B-NP------ B-MISC
lamb------------------------------------ NN-------- I-NP------ O
.--------------------------------------- .--------- O--------- O


#### Assign "ACT" tag

##### Function

In [25]:
def apply_chunk_to_ner_mapping(example, chunk_to_ner_mapping, label_all_tokens = True):
    """
    Return a copy of the example whose "ner_tags" are re-labelled from its "chunk_tags".

    For every position whose chunk tag is a key of ``chunk_to_ner_mapping``, the NER
    tag at that position is replaced by the mapped value; every other NER tag is kept.
    The input ``example`` and its "ner_tags" list are left unmodified.

    Args:
        example (dict): The example dictionary containing "chunk_tags" and "ner_tags".
        chunk_to_ner_mapping (dict): A mapping from chunk tags to ner tags.
        label_all_tokens (bool): Not used by this function; kept so existing calls
            that pass it keep working.

    Returns:
        dict: A shallow copy of the example with a new, updated "ner_tags" list.
    """
    # Work on a fresh list so the caller's example is not changed in place
    ner_tags = list(example['ner_tags'])
    for position, chunk in enumerate(example['chunk_tags']):
        if chunk in chunk_to_ner_mapping:
            ner_tags[position] = chunk_to_ner_mapping[chunk]

    # Every other entry is carried over unchanged
    return {**example, 'ner_tags': ner_tags}

##### Applied

In [26]:
# Chunk-tag id -> NER-label id used to create the "ACT" entity
chunk_to_ner_mapping = {
    21: 11,  # "B-VP": "B-ACT"
    22: 12  # "I-VP": "I-ACT"
}

In [27]:
# Re-label the "ner_tags" of the sample sentence using the chunk -> NER mapping
modifed_example = apply_chunk_to_ner_mapping(example, chunk_to_ner_mapping)

# Print the diagram of the re-labelled sentence
sentence_diagram(modifed_example, pos_list, chunk_list, label_list)

EU rejects German call to boycott British lamb .
EU-------------------------------------- NNP------- B-NP------ B-ORG
rejects--------------------------------- VBZ------- B-VP------ B-ACT
German---------------------------------- JJ-------- B-NP------ B-MISC
call------------------------------------ NN-------- I-NP------ O
to-------------------------------------- TO-------- B-VP------ B-ACT
boycott--------------------------------- VB-------- I-VP------ I-ACT
British--------------------------------- JJ-------- B-NP------ B-MISC
lamb------------------------------------ NN-------- I-NP------ O
.--------------------------------------- .--------- O--------- O


## Assign "ACT" entity to the entire dataset

In [29]:
def apply_mapping_to_split(dataset_split,chunk_to_ner_mapping):
    """
    Re-label every example of one split and rebuild the split as a new Dataset.

    Args:
        dataset_split: Iterable of example dicts providing the columns
            'id', 'tokens', 'pos_tags', 'chunk_tags' and 'ner_tags'.
        chunk_to_ner_mapping (dict): Chunk tag -> NER tag mapping handed to
            apply_chunk_to_ner_mapping for each example.

    Returns:
        datasets.Dataset: A dataset built from the five re-collected columns.
    """
    column_names = ('id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags')
    columns = {name: [] for name in column_names}

    for row in dataset_split:
        remapped = apply_chunk_to_ner_mapping(row, chunk_to_ner_mapping)

        # Collect the values column by column, the layout from_dict expects
        for name in column_names:
            columns[name].append(remapped[name])

    # NOTE(review): from_dict is called without `features`, so the column schema
    # is presumably inferred from the values - confirm that is intended.
    return datasets.Dataset.from_dict(columns)


In [30]:
# Re-label every split: VP chunk tags become the "ACT" entity labels

chunk_to_ner_mapping = {
    21: 11,  # "B-VP": "B-ACT"
    22: 12,  # "I-VP": "I-ACT"
}

# Build a new DatasetDict split by split from the current `dataset`
modified_dataset = datasets.DatasetDict({
    split: apply_mapping_to_split(dataset[split], chunk_to_ner_mapping)
    for split in ('train', 'validation', 'test')
})
modified_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [32]:
# Spot-check one sentence of the re-labelled test split
sentence_diagram(modified_dataset['test'][16], pos_list, chunk_list, label_list)

Japan coach Shu Kamo said : ' ' The Syrian own goal proved lucky for us .
Japan----------------------------------- NNP------- B-NP------ B-LOC
coach----------------------------------- NN-------- I-NP------ O
Shu------------------------------------- NNP------- I-NP------ B-PER
Kamo------------------------------------ NNP------- I-NP------ I-PER
said------------------------------------ VBD------- B-VP------ B-ACT
:--------------------------------------- :--------- O--------- O
'--------------------------------------- ''-------- O--------- O
'--------------------------------------- POS------- B-NP------ O
The------------------------------------- DT-------- I-NP------ O
Syrian---------------------------------- JJ-------- I-NP------ B-MISC
own------------------------------------- JJ-------- I-NP------ O
goal------------------------------------ NN-------- I-NP------ O
proved---------------------------------- VBD------- B-VP------ B-ACT
lucky----------------------------------- JJ-------- B-AD

# Upload into HF

In [33]:
# Upload the re-labelled DatasetDict to the Hugging Face Hub repository below, marked private
modified_dataset.push_to_hub('storymodelers/dataset-BERT-NER', private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/2.73k [00:00<?, ?B/s]