In [1]:
# Install the third-party packages this notebook imports, then download the small English
# spaCy pipeline that is loaded later with spacy.load("en_core_web_sm").
!pip install -q datasets scikit-learn python-crfsuite spacy
!python -m spacy download en_core_web_sm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m

In [7]:
import json
import re

import datasets
import pandas
import pycrfsuite
import spacy
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# from src.preprocess import *

In [8]:
# Load the WIESP2022-NER corpus and keep one handle per split.
# NOTE(review): `dk_data` is the validation split; presumably it is the split the
# domain-knowledge features were mined from -- confirm.
dataset = datasets.load_dataset("adsabs/WIESP2022-NER")
train_data, dk_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# Small English spaCy pipeline; add_spacy_ner() reads it through this module-level name.
nlp = spacy.load("en_core_web_sm")

# utils

In [9]:
def count_subtokens(token, subtoken_list):
    """Return how many entries of ``subtoken_list`` occur inside ``token``.

    Each list entry contributes at most 1, however many times it occurs in
    ``token``; duplicated list entries are counted once per duplicate.
    """
    return sum(1 for piece in subtoken_list if piece in token)

def add_spacy_ner(sentence):
    """Build per-token spaCy NER indicator features for one sentence.

    The original tokens are joined with single spaces and run through the
    module-level ``nlp`` pipeline. Every spaCy token whose start offset
    (``.idx``) falls before the end offset of original token ``i`` in the
    joined text is attributed to token ``i``; if that spaCy token has a
    non-empty ``ent_type_``, the key ``'spacy_ner_<type lower-cased>'`` is set
    to 1 in token ``i``'s feature dict.

    Args:
        sentence: sequence of ``(token, label)`` pairs; only the tokens are used.

    Returns:
        list[dict]: one (possibly empty) feature dict per original token, in order.
    """
    tokens = [token for token, _ in sentence]
    doc = nlp(" ".join(tokens))
    spacy_features = [{} for _ in tokens]  # one empty feature dict per original token

    spacy_index = 0
    n_spacy_tokens = len(doc)
    # End offset (exclusive) of the current original token inside the joined text.
    # Starting at -1 cancels the separator added for the first token, so the value always
    # equals len(" ".join(tokens[:i + 1])) without re-joining the prefix on every step.
    prefix_end = -1
    for i, token in enumerate(tokens):
        prefix_end += len(token) + 1
        while spacy_index < n_spacy_tokens and doc[spacy_index].idx < prefix_end:
            ent_type = doc[spacy_index].ent_type_
            if ent_type:
                spacy_features[i]['spacy_ner_' + ent_type.lower()] = 1
            spacy_index += 1

    return spacy_features

def search_regex(word, pattern):
    """Return True when ``pattern`` matches anywhere in ``word``, else False."""
    found = re.search(pattern, word)
    return found is not None


In [10]:
from collections import Counter

# Entity types kept for this experiment; every other tag in the corpus is mapped to "O"
# by process_entity_tag().
entity_name = [
    'Organization', 'Observatory', 'CelestialObject',
    'Event', 'CelestialRegion', 'Identifier',
]

# All B- tags first, then all I- tags, each group in `entity_name` order.
ner_tags = [f"{prefix}-{entity}" for prefix in ("B", "I") for entity in entity_name]

# "O" takes the id right after the last entity tag; entity tags are numbered by position.
tag_to_id = {"O": len(ner_tags)}
tag_to_id.update((tag, idx) for idx, tag in enumerate(ner_tags))

def process_entity_tag(data, ner_tags=ner_tags):
    """
    Restrict NER tags to the selected entities and collect the tokens that carry them.

    Args:
        data: indexable collection of documents supporting ``len()`` and integer
            indexing (e.g. a Hugging Face dataset split). Each document is a mapping
            with parallel ``'tokens'`` and ``'ner_tags'`` lists.
        ner_tags (list): tags to keep. Any tag that is neither in this list nor "O"
            is rewritten to "O".

    Returns:
        processed_tags (List[List[str]]): one list of tags per document, with
            unselected tags replaced by "O".
        ner_tokens (dict): maps each tag in ``ner_tags`` to the list of all tokens
            labeled with it, in corpus order.
        text (List[str]): one string per document, its tokens joined by single spaces.
    """
    processed_tags = []
    text = []
    ner_tokens = {tag: [] for tag in ner_tags}

    for n in range(len(data)):
        # Fetch the row once: indexing a dataset object may rebuild the whole row on every
        # access, which the previous per-token `data[n]['tokens'][i]` lookup paid repeatedly.
        row = data[n]
        tokens = row['tokens']
        tags = list(row['ner_tags'])  # copy so the source row is never modified
        for i, tag in enumerate(tags):
            if tag in ner_tokens:
                # target tag: remember which token carried it
                ner_tokens[tag].append(tokens[i])
            elif tag != "O":
                # tag outside the selection: treat the token as non-entity
                tags[i] = "O"

        processed_tags.append(tags)
        text.append(" ".join(tokens))

    return (processed_tags, ner_tokens, text)

def find_frequent_subword(tokens, n_gram, top):
    """Return the ``top`` most common character n-grams found in ``tokens``.

    Every contiguous substring of length ``n_gram`` is counted for each token at
    least ``n_gram`` characters long; shorter tokens contribute nothing.

    Returns:
        list[tuple[str, int]]: ``(subword, count)`` pairs as produced by
        ``Counter.most_common(top)``.
    """
    counts = Counter(
        t[start:start + n_gram]
        for t in tokens
        if len(t) >= n_gram
        for start in range(len(t) - n_gram + 1)
    )
    return counts.most_common(top)

In [11]:
def preprocess_data(dataset, sample=None):
    """Turn dataset rows into sentences of ``(token, tag)`` pairs.

    Tags come from ``process_entity_tag``, so tags outside the selected entity
    set are already replaced by "O".

    Args:
        dataset: indexable collection of rows, each with a ``'tokens'`` list.
        sample: number of leading rows to convert. A falsy value (``None`` or 0)
            means the whole dataset. Values larger than the dataset are clamped
            to its length instead of raising ``IndexError``.

    Returns:
        list[list[tuple]]: one list of ``(token, tag)`` pairs per converted row.
    """
    processed_tags, _, _ = process_entity_tag(data=dataset)
    total = len(dataset)
    limit = total if not sample else min(sample, total)
    return [
        list(zip(dataset[i]['tokens'], processed_tags[i]))
        for i in range(limit)
    ]

def word2features(sent, i, spacy_features, dk=None):
    """Build the CRF feature dict for token ``i`` of ``sent``.

    Features cover the token itself (lower-cased form, 2- and 3-character
    suffixes, case/digit flags, spaCy NER indicators) plus lower-cased form and
    case flags of the previous and next token. ``'BOS'`` / ``'EOS'`` mark the
    first / last token of the sentence.

    When ``dk`` is given, domain-knowledge features are added for the current
    token under their own key and for the neighbours under ``'-1:<key>'`` and
    ``'+1:<key>'``.

    Args:
        sent: sequence of ``(token, label)`` pairs; only the tokens are read.
        i: index of the token to featurize.
        spacy_features: list with one feature dict per token; entry ``i`` is
            merged into the result.
        dk: optional mapping with two sub-mappings: ``dk['sub_tokens']``
            (feature name -> list of substrings, counted with
            ``count_subtokens``) and ``dk['regex']`` (feature name -> pattern,
            tested with ``search_regex``).

    Returns:
        dict: feature name -> value for token ``i``.
    """
    word = sent[i][0]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    features.update(spacy_features[i])

    # Domain Knowledge features for the current token
    if dk:
        for k, v in dk['sub_tokens'].items():
            features[k] = count_subtokens(word, v)
        for k, v in dk['regex'].items():
            features[k] = search_regex(word, v)

    if i > 0:
        prev_word = sent[i-1][0]
        features.update({
            '-1:word.lower()': prev_word.lower(),
            '-1:word.istitle()': prev_word.istitle(),
            '-1:word.isupper()': prev_word.isupper(),
        })
        # Domain Knowledge features for the previous token
        if dk:
            for k, v in dk['sub_tokens'].items():
                features[f"-1:{k}"] = count_subtokens(prev_word, v)
            # Fix: this loop used to rewrite features[k] from the current word, so the
            # previous token never got regex features.
            # NOTE(review): models trained with the old extractor have never seen the
            # '-1:'/'+1:' regex keys; presumably the tagger ignores attributes unknown to
            # the model, so retraining is needed to benefit -- confirm.
            for k, v in dk['regex'].items():
                features[f"-1:{k}"] = search_regex(prev_word, v)
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        next_word = sent[i+1][0]
        features.update({
            '+1:word.lower()': next_word.lower(),
            '+1:word.istitle()': next_word.istitle(),
            '+1:word.isupper()': next_word.isupper(),
        })
        # Domain Knowledge features for the next token
        if dk:
            for k, v in dk['sub_tokens'].items():
                features[f"+1:{k}"] = count_subtokens(next_word, v)
            # Same fix as in the previous-token branch.
            for k, v in dk['regex'].items():
                features[f"+1:{k}"] = search_regex(next_word, v)
    else:
        features['EOS'] = True

    return features

def process_sentence(sentence):
    """Featurize one sentence without domain-knowledge features.

    Args:
        sentence: sequence of ``(token, label)`` pairs.

    Returns:
        tuple: ``(features, labels)`` where ``features`` holds one
        ``word2features`` dict per token and ``labels`` the matching labels.
    """
    ner_feats = add_spacy_ner(sentence)
    token_feats = [
        word2features(sentence, idx, ner_feats) for idx in range(len(sentence))
    ]
    labels = [tag for _, tag in sentence]
    return token_feats, labels

# generate features for dataset
def prepare_data(sentences, dk):
    """Featurize a list of sentences for the CRF.

    Args:
        sentences: iterable of sentences, each a sequence of ``(token, label)`` pairs.
        dk: domain-knowledge mapping forwarded to ``word2features``.

    Returns:
        tuple: ``(X, y)`` -- per sentence, the list of token feature dicts and
        the list of labels, in the same order as ``sentences``.
    """
    X, y = [], []
    for sentence in sentences:
        ner_feats = add_spacy_ner(sentence)
        token_feats = []
        for idx in range(len(sentence)):
            token_feats.append(word2features(sentence, idx, ner_feats, dk))
        X.append(token_feats)
        y.append([tag for _, tag in sentence])
    return X, y

# Biased

## Data Preprocess

In [12]:
# Load the domain-knowledge definitions used to build the extra CRF features.
# The encoding is given explicitly so the file is read the same way on every platform
# instead of depending on the locale's default encoding.
# NOTE(review): the variable is named `unbiased_dk` although the file loaded is
# 'biased_domain_knowledge.json'; the name is kept because later cells read it.
with open('biased_domain_knowledge.json', encoding='utf-8') as json_file:
    unbiased_dk = json.load(json_file)

In [13]:
# Pull out one domain-knowledge set per key of the loaded JSON.
# NOTE(review): the keys '200' / '500' / '1000' / 'all' presumably name the number of
# documents each set was derived from -- confirm against the file that produced it.
dk_200, dk_500, dk_1000, dk_all = (
    unbiased_dk[size_key] for size_key in ('200', '500', '1000', 'all')
)

In [14]:
# Data preprocess: build CRF feature / label sequences from the test split, once per
# domain-knowledge set. `sample=N` restricts preprocessing to the first N test documents.
# NOTE(review): the "half" / "25%" / "10%" labels presumably refer to the fraction of data
# the matching model and domain-knowledge set were built from -- confirm.
# Training features are not built in this notebook (kept for reference):
# X_train, y_train = prepare_data(preprocess_data(train_data), dk_all)
X_test_full, y_test_full = prepare_data(preprocess_data(test_data), dk_all) # Full: all test documents, dk_all
X_test_half, y_test_half = prepare_data(preprocess_data(test_data, sample=1000), dk_1000) # half: first 1000 documents, dk_1000
X_test_25, y_test_25 = prepare_data(preprocess_data(test_data, sample=500), dk_500) # 25%: first 500 documents, dk_500
X_test_10, y_test_10 = prepare_data(preprocess_data(test_data, sample=200), dk_200) # 10%: first 200 documents, dk_200

### Testing

In [21]:
# Full: tag X_test_full with the "full" model and score it against y_test_full.
# NOTE(review): this cell uses `pandas`, so `pandas` must already be imported in the
# kernel when it runs.
tagger = pycrfsuite.Tagger()
tagger.open(r'biased/ner-model-biased-domain-full.crfsuite')
# One predicted tag sequence per test document.
y_test_pred = [tagger.tag(xseq) for xseq in X_test_full]
# Flatten gold and predicted sequences so the report is computed per token.
y_test_flat = [item for sublist in y_test_full for item in sublist]
y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

# output_dict=True returns the report as a nested dict, which is turned into a table
# with one row per report entry and written to CSV.
result = classification_report(y_test_flat, y_test_pred_flat, output_dict=True)
df = pandas.DataFrame(result).transpose()
df.to_csv('biased-full-full.csv')
print(result)

{'B-CelestialObject': {'precision': 0.7839276210750399, 'recall': 0.4081463009143807, 'f1-score': 0.5368075801749271, 'support': 3609}, 'B-CelestialRegion': {'precision': 0.2835820895522388, 'recall': 0.09090909090909091, 'f1-score': 0.13768115942028988, 'support': 209}, 'B-Event': {'precision': 0.5625, 'recall': 0.15254237288135594, 'f1-score': 0.24000000000000002, 'support': 59}, 'B-Identifier': {'precision': 0.8840579710144928, 'recall': 0.3388888888888889, 'f1-score': 0.48995983935742976, 'support': 180}, 'B-Observatory': {'precision': 0.8820375335120644, 'recall': 0.744343891402715, 'f1-score': 0.8073619631901842, 'support': 1326}, 'B-Organization': {'precision': 0.9204283855469922, 'recall': 0.8067374331081674, 'f1-score': 0.8598410472183263, 'support': 11399}, 'I-CelestialObject': {'precision': 0.8177037686240141, 'recall': 0.44030202925908446, 'f1-score': 0.5723926380368098, 'support': 2119}, 'I-CelestialRegion': {'precision': 0.4205607476635514, 'recall': 0.11278195488721804, 

In [22]:
# Half: tag X_test_half with the "half" model and score it against y_test_half.
# NOTE(review): this cell uses `pandas`, so `pandas` must already be imported in the
# kernel when it runs.
tagger = pycrfsuite.Tagger()
tagger.open(r'biased/ner-model-biased-domain-half.crfsuite')
# One predicted tag sequence per test document.
y_test_pred = [tagger.tag(xseq) for xseq in X_test_half]
# Flatten gold and predicted sequences so the report is computed per token.
y_test_flat = [item for sublist in y_test_half for item in sublist]
y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

# output_dict=True returns the report as a nested dict, which is turned into a table
# with one row per report entry and written to CSV.
result = classification_report(y_test_flat, y_test_pred_flat, output_dict=True)
df = pandas.DataFrame(result).transpose()
df.to_csv('biased-half-half.csv')
print(result)

{'B-CelestialObject': {'precision': 0.2127659574468085, 'recall': 0.46511627906976744, 'f1-score': 0.291970802919708, 'support': 43}, 'B-CelestialRegion': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 9}, 'B-Event': {'precision': 0.6363636363636364, 'recall': 0.17073170731707318, 'f1-score': 0.2692307692307692, 'support': 41}, 'B-Identifier': {'precision': 0.8611111111111112, 'recall': 0.3875, 'f1-score': 0.5344827586206897, 'support': 80}, 'B-Observatory': {'precision': 0.869172932330827, 'recall': 0.706601466992665, 'f1-score': 0.7795010114632501, 'support': 818}, 'B-Organization': {'precision': 0.9151743638077285, 'recall': 0.7766224862888482, 'f1-score': 0.8402249830026577, 'support': 8752}, 'I-CelestialObject': {'precision': 0.2571428571428571, 'recall': 0.5625, 'f1-score': 0.3529411764705882, 'support': 32}, 'I-CelestialRegion': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 12}, 'I-Event': {'precision': 0.875, 'recall': 0.13023255813953488, 'f1-sco

In [23]:
# 25%: tag X_test_25 with the "25" model and score it against y_test_25.
# NOTE(review): this cell uses `pandas`, so `pandas` must already be imported in the
# kernel when it runs.
tagger = pycrfsuite.Tagger()
tagger.open(r'biased/ner-model-biased-domain-25.crfsuite')
# One predicted tag sequence per test document.
y_test_pred = [tagger.tag(xseq) for xseq in X_test_25]
# Flatten gold and predicted sequences so the report is computed per token.
y_test_flat = [item for sublist in y_test_25 for item in sublist]
y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

# output_dict=True returns the report as a nested dict, which is turned into a table
# with one row per report entry and written to CSV.
result = classification_report(y_test_flat, y_test_pred_flat, output_dict=True)
df = pandas.DataFrame(result).transpose()
df.to_csv('biased-25-25.csv')
print(result)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'B-CelestialObject': {'precision': 0.06521739130434782, 'recall': 0.17647058823529413, 'f1-score': 0.09523809523809525, 'support': 17}, 'B-CelestialRegion': {'precision': 0.75, 'recall': 0.75, 'f1-score': 0.75, 'support': 4}, 'B-Event': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 18}, 'B-Identifier': {'precision': 0.8571428571428571, 'recall': 0.3157894736842105, 'f1-score': 0.46153846153846156, 'support': 38}, 'B-Observatory': {'precision': 0.8357771260997068, 'recall': 0.6834532374100719, 'f1-score': 0.7519788918205806, 'support': 417}, 'B-Organization': {'precision': 0.9082754629629629, 'recall': 0.6856705985146352, 'f1-score': 0.7814289270599951, 'support': 4578}, 'I-CelestialObject': {'precision': 0.08571428571428572, 'recall': 0.21428571428571427, 'f1-score': 0.12244897959183673, 'support': 14}, 'I-CelestialRegion': {'precision': 0.6, 'recall': 0.375, 'f1-score': 0.4615384615384615, 'support': 8}, 'I-Event': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 's

  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
# 10%: tag X_test_10 with the "10" model and score it against y_test_10.
# NOTE(review): this cell uses `pandas`, so `pandas` must already be imported in the
# kernel when it runs.
tagger = pycrfsuite.Tagger()
tagger.open(r'biased/ner-model-biased-domain-10.crfsuite')
# One predicted tag sequence per test document.
y_test_pred = [tagger.tag(xseq) for xseq in X_test_10]
# Flatten gold and predicted sequences so the report is computed per token.
y_test_flat = [item for sublist in y_test_10 for item in sublist]
y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

# output_dict=True returns the report as a nested dict, which is turned into a table
# with one row per report entry and written to CSV.
result = classification_report(y_test_flat, y_test_pred_flat, output_dict=True)
df = pandas.DataFrame(result).transpose()
df.to_csv('biased-10-10.csv')
print(result)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'B-CelestialObject': {'precision': 0.08, 'recall': 0.2857142857142857, 'f1-score': 0.125, 'support': 7}, 'B-CelestialRegion': {'precision': 1.0, 'recall': 0.5, 'f1-score': 0.6666666666666666, 'support': 2}, 'B-Event': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 8}, 'B-Identifier': {'precision': 1.0, 'recall': 0.5714285714285714, 'f1-score': 0.7272727272727273, 'support': 7}, 'B-Observatory': {'precision': 0.7849462365591398, 'recall': 0.5104895104895105, 'f1-score': 0.6186440677966102, 'support': 143}, 'B-Organization': {'precision': 0.8451443569553806, 'recall': 0.637203166226913, 'f1-score': 0.7265889432117337, 'support': 1516}, 'I-CelestialObject': {'precision': 0.13333333333333333, 'recall': 0.2857142857142857, 'f1-score': 0.18181818181818182, 'support': 7}, 'I-CelestialRegion': {'precision': 1.0, 'recall': 0.16666666666666666, 'f1-score': 0.2857142857142857, 'support': 6}, 'I-Event': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 31}, 'I-Identifie

  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
import pandas

# Score every biased model on the same full test set (X_test_full / y_test_full) and write
# one CSV report per model. A single Tagger is reused; each open() switches it to the next
# model file.
tagger = pycrfsuite.Tagger()
for banner, model_path, csv_path in (
    ("###### 100% #####", r'biased/ner-model-biased-domain-full.crfsuite', 'biased-full.csv'),
    ("###### 50% #####", r'biased/ner-model-biased-domain-half.crfsuite', 'biased-half.csv'),
    ("###### 25% #####", r'biased/ner-model-biased-domain-25.crfsuite', 'biased-25.csv'),
    ("###### 10% #####", r'biased/ner-model-biased-domain-10.crfsuite', 'biased-10.csv'),
):
    print(banner)
    tagger.open(model_path)
    # One predicted tag sequence per test document, then flatten gold and predictions so
    # the report is computed per token.
    y_test_pred = [tagger.tag(xseq) for xseq in X_test_full]
    y_test_flat = [item for sublist in y_test_full for item in sublist]
    y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

    result = classification_report(y_test_flat, y_test_pred_flat, output_dict=True)
    df = pandas.DataFrame(result).transpose()
    df.to_csv(csv_path)
    print(result)

###### 100% #####
{'B-CelestialObject': {'precision': 0.7839276210750399, 'recall': 0.4081463009143807, 'f1-score': 0.5368075801749271, 'support': 3609}, 'B-CelestialRegion': {'precision': 0.2835820895522388, 'recall': 0.09090909090909091, 'f1-score': 0.13768115942028988, 'support': 209}, 'B-Event': {'precision': 0.5625, 'recall': 0.15254237288135594, 'f1-score': 0.24000000000000002, 'support': 59}, 'B-Identifier': {'precision': 0.8840579710144928, 'recall': 0.3388888888888889, 'f1-score': 0.48995983935742976, 'support': 180}, 'B-Observatory': {'precision': 0.8820375335120644, 'recall': 0.744343891402715, 'f1-score': 0.8073619631901842, 'support': 1326}, 'B-Organization': {'precision': 0.9204283855469922, 'recall': 0.8067374331081674, 'f1-score': 0.8598410472183263, 'support': 11399}, 'I-CelestialObject': {'precision': 0.8177037686240141, 'recall': 0.44030202925908446, 'f1-score': 0.5723926380368098, 'support': 2119}, 'I-CelestialRegion': {'precision': 0.4205607476635514, 'recall': 0.1