In [1]:
!pip install -q datasets scikit-learn python-crfsuite spacy
!python -m spacy download en_core_web_sm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31

In [1]:
import datasets
from sklearn.model_selection import train_test_split
import pycrfsuite
from sklearn.metrics import classification_report
import json
import re

# from src.preprocess import *
import spacy

In [2]:
# Load the WIESP2022-NER dataset and keep a handle on the splits used in this notebook.
dataset = datasets.load_dataset("adsabs/WIESP2022-NER")
train_data, dk_data, test_data = (
    dataset[split] for split in ("train", "validation", "test")
)

# spaCy English pipeline; add_spacy_ner reads it through the global name `nlp`.
nlp = spacy.load("en_core_web_sm")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Data Preprocess

In [32]:
# Colab-only step: open the upload prompt so the domain-knowledge JSON file
# is available in the working directory.
from google.colab import files

uploaded = files.upload()

Saving biased_domain_knowledge.json to biased_domain_knowledge.json


In [3]:
# Read the uploaded domain-knowledge file.
# NOTE(review): the file is named "biased_..." while the variable says "unbiased" — confirm which is intended.
with open('biased_domain_knowledge.json') as fh:
    unbiased_dk = json.load(fh)

In [4]:
# One domain-knowledge set per key of the loaded JSON; each is later paired
# with a training run of the corresponding size.
dk_200, dk_500, dk_1000, dk_all = (
    unbiased_dk[key] for key in ('200', '500', '1000', 'all')
)

In [5]:
def count_subtokens(token, subtoken_list):
    """Return how many entries of ``subtoken_list`` occur as substrings of ``token``.

    An entry contributes at most 1 no matter how often it appears inside
    ``token``; duplicate entries in the list are each counted.
    """
    return sum(1 for piece in subtoken_list if piece in token)

def add_spacy_ner(sentence):
    """Project spaCy entity types onto the caller's own tokenization.

    The tokens of ``sentence`` are joined with single spaces and run through
    the global spaCy pipeline ``nlp``. spaCy may split the text differently,
    so every spaCy token is attributed to the first original token whose end
    offset (in the joined string) lies beyond the spaCy token's start offset.

    Args:
        sentence: sequence of ``(token, label)`` pairs; only the tokens are used.

    Returns:
        list[dict]: one dict per original token. For each entity type spaCy
        assigned to an aligned spaCy token, the dict holds
        ``'spacy_ner_<type lowercased>': 1``; tokens without an entity get ``{}``.
    """
    tokens = [token for token, _ in sentence]
    doc = nlp(" ".join(tokens))
    spacy_features = [{} for _ in tokens]  # Initialize empty feature dict for each token

    spacy_index = 0
    # Running end offset of the current original token inside the joined text.
    # Starting at -1 cancels the separator added for the first token. This
    # replaces re-joining the whole prefix on every loop check.
    end_offset = -1
    for i, token in enumerate(tokens):
        end_offset += len(token) + 1
        # Consume every spaCy token that starts before this original token ends.
        while spacy_index < len(doc) and doc[spacy_index].idx < end_offset:
            ent_type = doc[spacy_index].ent_type_
            if ent_type:
                spacy_features[i]['spacy_ner_' + ent_type.lower()] = 1
            spacy_index += 1

    return spacy_features

def search_regex(word, pattern):
    """Tell whether ``pattern`` matches anywhere inside ``word`` (``re.search`` semantics)."""
    match = re.search(pattern, word)
    return match is not None


In [6]:
from collections import Counter

# Entity types kept as prediction targets.
entity_name = [
    "Organization",
    "Observatory",
    "CelestialObject",
    "Event",
    "CelestialRegion",
    "Identifier",
]

# BIO labels: every B- tag first, then every I- tag, each group in entity_name order.
ner_tags = [f"{prefix}-{entity}" for prefix in ("B", "I") for entity in entity_name]

# "O" gets the id right after the last BIO tag; BIO tags are numbered by
# their position in ner_tags.
tag_to_id = {"O": len(ner_tags)}
tag_to_id.update((tag, idx) for idx, tag in enumerate(ner_tags))

def process_entity_tag(data, ner_tags=ner_tags):
    """
    Restrict the NER labels of a dataset to the selected tags.

    Labels listed in ``ner_tags`` are kept, "O" is kept, and every other
    label is replaced by "O".

    Args:
        data: indexable collection of rows (e.g. a Hugging Face dataset split);
            each row exposes 'tokens' and 'ner_tags' of equal length.
        ner_tags (list): labels to keep.

    Returns:
        processed_tags (List[List[str]]): one label list per document, after the replacement above
        ner_tokens (dict): maps each kept label to the list of all tokens carrying it
        text (List[str]): one string per document, its tokens joined by single spaces
    """
    processed_tags = []
    text = []
    ner_tokens = {tag: [] for tag in ner_tags}

    for n in range(len(data)):
        # Fetch the row once: indexing `data` again for every token repeats
        # the lookup (presumably costly on a Hugging Face dataset).
        row = data[n]
        tokens = row['tokens']
        doc_tags = list(row['ner_tags'])  # work on a copy, never on the row itself
        for i, tag in enumerate(doc_tags):
            # target label: remember the token it was attached to
            if tag in ner_tokens:
                ner_tokens[tag].append(tokens[i])
            # label outside the selection: collapse it into "O"
            elif tag != "O":
                doc_tags[i] = "O"

        processed_tags.append(doc_tags)
        text.append(" ".join(tokens))

    return (processed_tags, ner_tokens, text)

def find_frequent_subword(tokens, n_gram, top):
    """Return the ``top`` most common character n-grams found in ``tokens``.

    Every contiguous substring of length ``n_gram`` of every token is counted;
    tokens shorter than ``n_gram`` contribute nothing. The result is the
    ``Counter.most_common(top)`` list of ``(substring, count)`` pairs.
    """
    counts = Counter(
        token[start:start + n_gram]
        for token in tokens
        for start in range(len(token) - n_gram + 1)
    )
    return counts.most_common(top)

In [7]:
def preprocess_data(dataset, sample=None):
    """Turn the first ``sample`` rows of ``dataset`` into ``(token, label)`` sentences.

    Labels come from ``process_entity_tag``, i.e. labels outside the selected
    entity set are already replaced by "O".

    Args:
        dataset: indexable collection of rows exposing 'tokens' and 'ner_tags'.
        sample: number of leading rows to use. A falsy value (``None`` or 0)
            means every row. A value larger than the dataset is clamped to
            its length rather than raising ``IndexError``.

    Returns:
        list[list[tuple]]: one list of ``(token, label)`` pairs per document.
    """
    processed_tags, _, _ = process_entity_tag(data=dataset)
    total = len(dataset)
    limit = total if not sample else min(sample, total)
    return [
        list(zip(dataset[i]['tokens'], processed_tags[i]))
        for i in range(limit)
    ]

def _dk_features(word, dk, prefix=""):
    """Domain-knowledge features of one word, every key prefixed with ``prefix``."""
    feats = {}
    for k, v in dk['sub_tokens'].items():
        feats[f"{prefix}{k}"] = count_subtokens(word, v)
    # Regex features are written second, so they win if a name is shared
    # with a sub-token feature.
    for k, v in dk['regex'].items():
        feats[f"{prefix}{k}"] = search_regex(word, v)
    return feats


def word2features(sent, i, spacy_features, dk=None):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: sequence of ``(token, label)`` pairs.
        i: index of the token to describe.
        spacy_features: per-token feature dicts (as produced by
            ``add_spacy_ner``); ``spacy_features[i]`` is merged into the result.
        dk: optional domain knowledge with two mappings: ``dk['sub_tokens']``
            (feature name -> substrings to count) and ``dk['regex']``
            (feature name -> pattern to search for). Skipped when falsy.

    Returns:
        dict: features of the token itself, plus ``-1:`` / ``+1:`` prefixed
        features of its neighbours, plus ``'BOS'`` / ``'EOS'`` markers at the
        sentence boundaries.
    """
    word = sent[i][0]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    features.update(spacy_features[i])

    # Domain Knowledge features
    if dk:
        features.update(_dk_features(word, dk))

    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
        # Domain Knowledge features of the previous word. The regex features
        # are computed on the neighbour and stored under the '-1:' prefix
        # (they used to re-test the current word under the unprefixed key).
        if dk:
            features.update(_dk_features(word1, dk, prefix="-1:"))
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
        # Domain Knowledge features of the next word, same fix under '+1:'.
        if dk:
            features.update(_dk_features(word1, dk, prefix="+1:"))
    else:
        features['EOS'] = True

    return features

def process_sentence(sentence, dk=None):
    """Featurize a single sentence.

    Args:
        sentence: sequence of ``(token, label)`` pairs.
        dk: optional domain knowledge forwarded to ``word2features``. The
            default ``None`` keeps the earlier behaviour of producing no
            domain-knowledge features.

    Returns:
        tuple: ``(features, labels)`` where ``features`` holds one feature
        dict per token and ``labels`` the matching labels.
    """
    spacy_features = add_spacy_ner(sentence)
    features = [
        word2features(sentence, i, spacy_features, dk)
        for i in range(len(sentence))
    ]
    labels = [label for _, label in sentence]
    return features, labels

# generate features for dataset
def prepare_data(sentences, dk):
    """Featurize a collection of sentences for CRF training.

    For every sentence the spaCy features are computed once, then each token
    is turned into a feature dict by ``word2features`` using ``dk``.

    Returns:
        tuple: ``(X, y)``; ``X[k]`` is the list of feature dicts of sentence
        ``k`` and ``y[k]`` the list of its labels.
    """
    X, y = [], []
    for sentence in sentences:
        ner_hints = add_spacy_ner(sentence)
        token_features = [
            word2features(sentence, pos, ner_hints, dk)
            for pos in range(len(sentence))
        ]
        X.append(token_features)
        y.append([tag for _, tag in sentence])
    return X, y

# All Data

In [35]:
# Data preprocess: featurize every training document, using the 'all' domain-knowledge set.
X_train, y_train = prepare_data(preprocess_data(train_data), dk_all)
# Test-split featurization is currently disabled:
# X_test, y_test = prepare_data(preprocess_data(test_data), dk_all)

## Training

In [36]:
%%time
# Train a CRF on the full-data features and write the model to disk.
trainer = pycrfsuite.Trainer(verbose=False)
# Register each document as one (feature sequence, label sequence) training item.
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # upper bound on training iterations
    'feature.possible_transitions': True  # NOTE(review): presumably also generates transition features for label pairs never observed in training — confirm against the CRFsuite docs
})
# The argument is the file name the trained model is stored under.
trainer.train('ner-model-biased-domain-full.crfsuite')

CPU times: user 5min 21s, sys: 2.89 s, total: 5min 24s
Wall time: 5min 24s


# Half Data

In [8]:
# Data preprocess: featurize the first 1000 training documents, using the '1000' domain-knowledge set.
# NOTE(review): this section is titled "Half Data" — confirm that 1000 documents is half of the training split.
X_train_50_1, y_train_50_1 = prepare_data(preprocess_data(train_data, sample=1000), dk_1000)
# Test-split featurization is currently disabled:
# X_test, y_test = prepare_data(preprocess_data(test_data, sample=1000), dk_1000)

## Training

In [9]:
%%time
# Train a CRF on the 1000-document features and write the model to disk.
trainer = pycrfsuite.Trainer(verbose=False)
# Register each document as one (feature sequence, label sequence) training item.
for xseq, yseq in zip(X_train_50_1, y_train_50_1):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # upper bound on training iterations
    'feature.possible_transitions': True  # NOTE(review): presumably also generates transition features for label pairs never observed in training — confirm against the CRFsuite docs
})
# The argument is the file name the trained model is stored under.
trainer.train('ner-model-biased-domain-half.crfsuite')

CPU times: user 3min, sys: 1.53 s, total: 3min 1s
Wall time: 3min 1s


# 1/4 Data

In [10]:
# Data preprocess: featurize the first 500 training documents, using the '500' domain-knowledge set.
# NOTE(review): this section is titled "1/4 Data" — confirm that 500 documents is a quarter of the training split.
X_train_25, y_train_25 = prepare_data(preprocess_data(train_data, sample=500), dk_500)
# Test-split featurization is currently disabled:
# X_test, y_test = prepare_data(preprocess_data(test_data, sample=500), dk_500)

## Training

In [11]:
%%time
# Train a CRF on the 500-document features and write the model to disk.
trainer = pycrfsuite.Trainer(verbose=False)
# Register each document as one (feature sequence, label sequence) training item.
for xseq, yseq in zip(X_train_25, y_train_25):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # upper bound on training iterations
    'feature.possible_transitions': True  # NOTE(review): presumably also generates transition features for label pairs never observed in training — confirm against the CRFsuite docs
})
# The argument is the file name the trained model is stored under.
trainer.train('ner-model-biased-domain-25.crfsuite')

CPU times: user 1min 29s, sys: 351 ms, total: 1min 29s
Wall time: 1min 29s


# 1/10 Data

In [12]:
# Data preprocess: featurize the first 200 training documents, using the '200' domain-knowledge set.
# NOTE(review): this section is titled "1/10 Data" — confirm that 200 documents is a tenth of the training split.
X_train_10, y_train_10 = prepare_data(preprocess_data(train_data, sample=200), dk_200)
# Test-split featurization is currently disabled:
# X_test, y_test = prepare_data(preprocess_data(test_data, sample=200), dk_200)

## Training

In [13]:
%%time
# Train a CRF on the 200-document features and write the model to disk.
trainer = pycrfsuite.Trainer(verbose=False)
# Register each document as one (feature sequence, label sequence) training item.
for xseq, yseq in zip(X_train_10, y_train_10):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # upper bound on training iterations
    'feature.possible_transitions': True  # NOTE(review): presumably also generates transition features for label pairs never observed in training — confirm against the CRFsuite docs
})
# The argument is the file name the trained model is stored under.
trainer.train('ner-model-biased-domain-10.crfsuite')

CPU times: user 31.8 s, sys: 111 ms, total: 31.9 s
Wall time: 32 s
