In [1]:
!pip install -q datasets scikit-learn python-crfsuite spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import datasets
from sklearn.model_selection import train_test_split
import pycrfsuite
from sklearn.metrics import classification_report
import json
import re

# from src.preprocess import *
import spacy

In [3]:
# Load the WIESP2022-NER corpus and bind its three splits to module-level names.
# Note that the 'validation' split is kept under the name `dk_data`.
dataset = datasets.load_dataset("adsabs/WIESP2022-NER")
train_data, dk_data, test_data = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

# spaCy pipeline used by add_spacy_ner(), which reads this module-level `nlp`.
nlp = spacy.load("en_core_web_sm")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/6.12k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.34M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1753 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1366 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2505 [00:00<?, ? examples/s]

# utils

In [4]:
def count_subtokens(token, subtoken_list):
    """Return how many entries of ``subtoken_list`` are contained in ``token``.

    Every entry contributes at most 1, however often it occurs inside ``token``;
    duplicate entries in ``subtoken_list`` are each counted.
    """
    return sum(1 for piece in subtoken_list if piece in token)

def add_spacy_ner(sentence):
    """Derive per-token spaCy entity-type features for one sentence.

    The tokens are joined with single spaces and run through the module-level
    ``nlp`` pipeline. Each spaCy token is then assigned to the original token whose
    character span (in the joined string) contains the spaCy token's start offset.

    Args:
        sentence: sequence of (token, label) pairs; only the token text is used.

    Returns:
        list[dict]: one dict per original token. For every spaCy token mapped to it
        that has a non-empty ``ent_type_``, the key ``'spacy_ner_<type lowercased>'``
        is set to 1. Tokens without entity information get an empty dict.
    """
    tokens = [token for token, _ in sentence]
    doc = nlp(" ".join(tokens))
    spacy_features = [{} for _ in tokens]  # Initialize empty feature dict for each token

    # Match spaCy tokens to the original tokens and assign NER tags.
    spacy_index = 0
    # Character offset just past the current original token in the joined string.
    # It is maintained incrementally instead of re-joining the sentence prefix on
    # every comparison; starting at -1 cancels the separator added for the first token.
    token_end = -1
    for i, token in enumerate(tokens):
        token_end += len(token) + 1
        while spacy_index < len(doc) and doc[spacy_index].idx < token_end:
            ent_type = doc[spacy_index].ent_type_
            if ent_type:
                spacy_features[i]['spacy_ner_' + ent_type.lower()] = 1
            spacy_index += 1

    return spacy_features

def search_regex(word, pattern):
    """Tell whether ``pattern`` matches anywhere in ``word`` (``re.search`` semantics)."""
    found = re.search(pattern, word)
    return found is not None


In [5]:
from collections import Counter

# Entity types this notebook works with.
entity_name = [
    'Organization',
    'Observatory',
    'CelestialObject',
    'Event',
    'CelestialRegion',
    'Identifier',
]

# BIO tag inventory: every "B-" tag first, then every "I-" tag, each in entity_name order.
ner_tags = [f"{prefix}{entity}" for prefix in ("B-", "I-") for entity in entity_name]

# Tag -> integer id. "O" is inserted first and gets the id right after the last
# entity tag; the entity tags are numbered by their position in ner_tags.
tag_to_id = {"O": len(ner_tags)}
tag_to_id.update((tag, position) for position, tag in enumerate(ner_tags))

def process_entity_tag(data, ner_tags=ner_tags):
    """
    Restrict the NER tags of every document to the selected tags.

    Args:
        data: indexable collection of records (e.g. a Hugging Face dataset split);
            each record provides parallel 'tokens' and 'ner_tags' lists.
        ner_tags (list): tags to keep. Any other tag that is not "O" is replaced by "O".

    Returns:
        processed_tags (List[List[str]]): one list of (filtered) ner tags per document
        ner_tokens (dict): key is a ner tag, value is the list of all tokens labeled with that tag
        text (List[str]): one string per document, its tokens joined by single spaces
    """
    processed_tags = []
    text = []
    # The dict doubles as the membership test for "is this a target tag?".
    ner_tokens = {tag: [] for tag in ner_tags}

    for n in range(len(data)):
        # Look the record up once per document. Indexing inside the token loop repeats
        # the row lookup for every token, which is expensive when `data` is a
        # Hugging Face dataset (each access rebuilds the row).
        record = data[n]
        tokens = record['tokens']
        # Work on a copy so the tag list held by the record is never modified.
        ner_copy = record['ner_tags'].copy()
        for i, t in enumerate(ner_copy):
            # target ner
            if t in ner_tokens:
                ner_tokens[t].append(tokens[i])
            # redundant ner
            elif t != "O":
                ner_copy[i] = "O"

        processed_tags.append(ner_copy)
        text.append(" ".join(tokens))

    return (processed_tags, ner_tokens, text)

def find_frequent_subword(tokens, n_gram, top):
    """Return the ``top`` most common character n-grams across ``tokens``.

    Args:
        tokens: iterable of strings.
        n_gram (int): length of the character windows to count.
        top: how many entries to return (passed to ``Counter.most_common``).

    Returns:
        list of (subword, count) pairs, most frequent first. Tokens shorter than
        ``n_gram`` contribute nothing.
    """
    tally = Counter()
    for tok in tokens:
        if len(tok) < n_gram:
            continue
        last_start = len(tok) - n_gram
        # Slide a window of width n_gram over the token, left to right.
        tally.update(tok[start:start + n_gram] for start in range(last_start + 1))
    return tally.most_common(top)

In [6]:
def preprocess_data(dataset, sample=None):
    """Turn a dataset into a list of sentences of (token, tag) pairs.

    Tags come from ``process_entity_tag`` (run over the whole dataset); tokens are
    read from each record's 'tokens' field.

    Args:
        dataset: indexable collection of records with a 'tokens' field.
        sample: number of leading records to convert. Any falsy value (``None``, 0)
            means "all records".

    Returns:
        list[list[tuple]]: for each converted record, its (token, tag) pairs.
    """
    tags_by_doc, _tokens_by_tag, _texts = process_entity_tag(data=dataset)
    limit = sample if sample else len(dataset)
    return [
        list(zip(dataset[idx]['tokens'], tags_by_doc[idx]))
        for idx in range(limit)
    ]

def word2features(sent, i, spacy_features, dk=None):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: sequence of (token, label) pairs; only the token text is read.
        i (int): index of the token to describe.
        spacy_features: per-token dicts (indexed like ``sent``); the dict at ``i`` is
            merged into the result.
        dk (dict, optional): domain knowledge with a 'sub_tokens' mapping
            (feature name -> list of substrings) and a 'regex' mapping
            (feature name -> pattern). Ignored when falsy.

    Returns:
        dict: features of the current token, features of the previous / next token
        under keys prefixed with '-1:' / '+1:', and 'BOS' / 'EOS' set to True at the
        first / last position.
    """
    def domain_features(target, prefix=""):
        # Shared by the current, previous and next token so all three get the same
        # sub-token counts and regex flags, each under its own key prefix.
        found = {}
        for k, v in dk['sub_tokens'].items():
            found[f"{prefix}{k}"] = count_subtokens(target, v)
        for k, v in dk['regex'].items():
            found[f"{prefix}{k}"] = search_regex(target, v)
        return found

    word = sent[i][0]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    features.update(spacy_features[i])

    # Domain Knowledge features
    if dk:
        features.update(domain_features(word))

    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
        # Domain Knowledge features of the previous token. The regex flags used to be
        # recomputed on the current word under the unprefixed key, which produced no
        # neighbour feature at all; they now describe word1 under '-1:<name>'.
        # NOTE(review): models trained with the old feature set have no weights for
        # the new '-1:'/'+1:' regex keys - retrain to benefit from them.
        if dk:
            features.update(domain_features(word1, "-1:"))
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
        # Domain Knowledge features of the next token (same fix as for '-1:').
        if dk:
            features.update(domain_features(word1, "+1:"))
    else:
        features['EOS'] = True

    return features

def process_sentence(sentence):
    """Featurize one sentence without domain-knowledge features.

    Args:
        sentence: sequence of (token, label) pairs.

    Returns:
        tuple: (list of per-token feature dicts from ``word2features``, list of labels).
    """
    spacy_features = add_spacy_ner(sentence)
    feature_seq = []
    for position in range(len(sentence)):
        feature_seq.append(word2features(sentence, position, spacy_features))
    label_seq = [tag for _tok, tag in sentence]
    return feature_seq, label_seq

# generate features for dataset
def prepare_data(sentences, dk):
    """Convert sentences into parallel feature and label sequences.

    Args:
        sentences: iterable of sentences, each a sequence of (token, label) pairs.
        dk: domain-knowledge dict handed to ``word2features`` for every token.

    Returns:
        tuple: ``(X, y)`` where ``X[k]`` is the list of feature dicts of sentence k
        and ``y[k]`` is the list of its labels.
    """
    def featurize(sent):
        ner_hints = add_spacy_ner(sent)
        return [word2features(sent, pos, ner_hints, dk) for pos in range(len(sent))]

    # Single pass over `sentences`: features first, then labels, per sentence.
    pairs = [(featurize(sent), [tag for _tok, tag in sent]) for sent in sentences]
    X = [feats for feats, _labels in pairs]
    y = [labels for _feats, labels in pairs]
    return X, y

# Unbiased

## Data Preprocessing

In [7]:
# Load the domain-knowledge definitions consumed by word2features (which reads the
# 'sub_tokens' and 'regex' entries). The encoding is stated explicitly so the file is
# read the same way regardless of the platform's default locale.
with open('unbiased_domain_knowledge.json', encoding='utf-8') as json_file:
    unbiased_dk = json.load(json_file)

In [8]:
# Data preprocess: turn the test split into (token, tag) sentences, then into
# per-token feature dicts (X_test) and label sequences (y_test).
X_test, y_test = prepare_data(
    sentences=preprocess_data(dataset=test_data),
    dk=unbiased_dk,
)

### Testing

In [14]:
import pandas

In [15]:
# Full
# Tag the test set with the model stored in 'ner-model-unbiased-domain-full.crfsuite'.
tagger = pycrfsuite.Tagger()
tagger.open(r'ner-model-unbiased-domain-full.crfsuite')
y_test_pred = [tagger.tag(xseq) for xseq in X_test]
# Flatten the per-sentence label sequences so the report is computed per token.
y_test_flat = [item for sublist in y_test for item in sublist]
y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

# zero_division=0 reports 0.0 for labels with an undefined score (the same value the
# default produces) without emitting undefined-metric warnings.
result = classification_report(y_test_flat, y_test_pred_flat, output_dict=True, zero_division=0)
df = pandas.DataFrame(result).transpose()
df.to_csv('unbiased-full.csv')
print(result)

{'B-CelestialObject': {'precision': 0.7655099894847529, 'recall': 0.4034358548074259, 'f1-score': 0.5283977499546363, 'support': 3609}, 'B-CelestialRegion': {'precision': 0.3181818181818182, 'recall': 0.10047846889952153, 'f1-score': 0.1527272727272727, 'support': 209}, 'B-Event': {'precision': 0.7272727272727273, 'recall': 0.2711864406779661, 'f1-score': 0.39506172839506176, 'support': 59}, 'B-Identifier': {'precision': 0.875, 'recall': 0.2722222222222222, 'f1-score': 0.4152542372881356, 'support': 180}, 'B-Observatory': {'precision': 0.8894977168949771, 'recall': 0.7345399698340875, 'f1-score': 0.8046261875258158, 'support': 1326}, 'B-Organization': {'precision': 0.9075022597167821, 'recall': 0.7927011141328186, 'f1-score': 0.8462258849971905, 'support': 11399}, 'I-CelestialObject': {'precision': 0.7870216306156406, 'recall': 0.44643699858423785, 'f1-score': 0.5697079193014152, 'support': 2119}, 'I-CelestialRegion': {'precision': 0.4330708661417323, 'recall': 0.13784461152882205, 'f1

In [16]:
# Half
# Tag the test set with the model stored in 'ner-model-unbiased-domain-half.crfsuite'.
tagger = pycrfsuite.Tagger()
tagger.open(r'ner-model-unbiased-domain-half.crfsuite')
y_test_pred = [tagger.tag(xseq) for xseq in X_test]
# Flatten the per-sentence label sequences so the report is computed per token.
y_test_flat = [item for sublist in y_test for item in sublist]
y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

# zero_division=0 reports 0.0 for labels with an undefined score (the same value the
# default produces) without emitting undefined-metric warnings.
result = classification_report(y_test_flat, y_test_pred_flat, output_dict=True, zero_division=0)
df = pandas.DataFrame(result).transpose()
df.to_csv('unbiased-half.csv')
print(result)

{'B-CelestialObject': {'precision': 0.7664057796508128, 'recall': 0.35272928789138264, 'f1-score': 0.4831119544592031, 'support': 3609}, 'B-CelestialRegion': {'precision': 0.2830188679245283, 'recall': 0.07177033492822966, 'f1-score': 0.11450381679389313, 'support': 209}, 'B-Event': {'precision': 0.35714285714285715, 'recall': 0.0847457627118644, 'f1-score': 0.136986301369863, 'support': 59}, 'B-Identifier': {'precision': 0.8431372549019608, 'recall': 0.2388888888888889, 'f1-score': 0.3722943722943723, 'support': 180}, 'B-Observatory': {'precision': 0.8989098116947473, 'recall': 0.6840120663650076, 'f1-score': 0.7768736616702355, 'support': 1326}, 'B-Organization': {'precision': 0.9115184910458832, 'recall': 0.7546275989121852, 'f1-score': 0.8256863121520445, 'support': 11399}, 'I-CelestialObject': {'precision': 0.8046948356807512, 'recall': 0.4044360547428032, 'f1-score': 0.5383165829145727, 'support': 2119}, 'I-CelestialRegion': {'precision': 0.3416666666666667, 'recall': 0.102756892

In [17]:
# 25%
# Tag the test set with the model stored in 'ner-model-unbiased-domain-25.crfsuite'.
tagger = pycrfsuite.Tagger()
tagger.open(r'ner-model-unbiased-domain-25.crfsuite')
y_test_pred = [tagger.tag(xseq) for xseq in X_test]
# Flatten the per-sentence label sequences so the report is computed per token.
y_test_flat = [item for sublist in y_test for item in sublist]
y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

# This run triggers scikit-learn's undefined-metric warnings. zero_division=0 keeps
# the reported 0.0 scores while making that choice explicit and silencing the warnings.
result = classification_report(y_test_flat, y_test_pred_flat, output_dict=True, zero_division=0)
df = pandas.DataFrame(result).transpose()
df.to_csv('unbiased-25.csv')
print(result)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'B-CelestialObject': {'precision': 0.6924012158054711, 'recall': 0.315599889165974, 'f1-score': 0.4335744194899125, 'support': 3609}, 'B-CelestialRegion': {'precision': 0.11428571428571428, 'recall': 0.019138755980861243, 'f1-score': 0.032786885245901634, 'support': 209}, 'B-Event': {'precision': 0.6, 'recall': 0.05084745762711865, 'f1-score': 0.09375000000000001, 'support': 59}, 'B-Identifier': {'precision': 0.8032786885245902, 'recall': 0.2722222222222222, 'f1-score': 0.4066390041493776, 'support': 180}, 'B-Observatory': {'precision': 0.8450413223140496, 'recall': 0.6168929110105581, 'f1-score': 0.7131647776809067, 'support': 1326}, 'B-Organization': {'precision': 0.8964000463016553, 'recall': 0.6793578384068778, 'f1-score': 0.7729314302824634, 'support': 11399}, 'I-CelestialObject': {'precision': 0.7367088607594937, 'recall': 0.27465785747994337, 'f1-score': 0.4001375042970093, 'support': 2119}, 'I-CelestialRegion': {'precision': 0.175, 'recall': 0.017543859649122806, 'f1-score': 0

  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# 10%
# Tag the test set with the model stored in 'ner-model-unbiased-domain-10.crfsuite'.
tagger = pycrfsuite.Tagger()
tagger.open(r'ner-model-unbiased-domain-10.crfsuite')
y_test_pred = [tagger.tag(xseq) for xseq in X_test]
# Flatten the per-sentence label sequences so the report is computed per token.
y_test_flat = [item for sublist in y_test for item in sublist]
y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

# zero_division=0 reports 0.0 for labels with an undefined score (the same value the
# default produces) without emitting undefined-metric warnings.
result = classification_report(y_test_flat, y_test_pred_flat, output_dict=True, zero_division=0)
df = pandas.DataFrame(result).transpose()
df.to_csv('unbiased-10.csv')
print(result)

{'B-CelestialObject': {'precision': 0.6571218795888399, 'recall': 0.24799113327791633, 'f1-score': 0.36008851337759, 'support': 3609}, 'B-CelestialRegion': {'precision': 0.12121212121212122, 'recall': 0.019138755980861243, 'f1-score': 0.03305785123966942, 'support': 209}, 'B-Event': {'precision': 1.0, 'recall': 0.03389830508474576, 'f1-score': 0.06557377049180328, 'support': 59}, 'B-Identifier': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 180}, 'B-Observatory': {'precision': 0.6781750924784217, 'recall': 0.41478129713423834, 'f1-score': 0.5147402901263454, 'support': 1326}, 'B-Organization': {'precision': 0.8628313324616664, 'recall': 0.636810246512852, 'f1-score': 0.7327882091661618, 'support': 11399}, 'I-CelestialObject': {'precision': 0.776049766718507, 'recall': 0.23548843794242566, 'f1-score': 0.3613323678493845, 'support': 2119}, 'I-CelestialRegion': {'precision': 0.1643835616438356, 'recall': 0.03007518796992481, 'f1-score': 0.05084745762711865, 'support': 399}