In [6]:
import datasets
from sklearn.model_selection import train_test_split
import pycrfsuite
from sklearn.metrics import classification_report
import json
import re

from src.preprocess import *
import spacy

In [5]:
dataset = datasets.load_dataset("adsabs/WIESP2022-NER")
train_data = dataset['train']
dk_data = dataset['validation']
test_data = dataset['test']

nlp = spacy.load("en_core_web_sm")

# Data Preprocess

In [13]:
with open('unbiased_domain_knowledge.json') as json_file:
    unbiased_dk = json.load(json_file)

In [10]:
def count_subtokens(token, subtoken_list):
    count = 0
    for subtoken in subtoken_list:
        if subtoken in token:
            count += 1
    return count

def add_spacy_ner(sentence):
    doc = nlp(" ".join(token for token, _ in sentence))
    spacy_features = [{} for _ in sentence]  # Initialize empty feature dict for each token

    # Match spaCy tokens to the original tokens and assign NER tags
    spacy_index = 0
    for i, (token, _) in enumerate(sentence):
        while spacy_index < len(doc) and doc[spacy_index].idx < len(" ".join(sentence[i][0] for i in range(0, i + 1))):
            if doc[spacy_index].ent_type_:
                spacy_features[i]['spacy_ner_' + doc[spacy_index].ent_type_.lower()] = 1
            spacy_index += 1

    return spacy_features

def search_regex(word, pattern):
    return bool(re.search(pattern, word))


In [11]:
def preprocess_data(dataset):
    processed_tags, _, __ = process_entity_tag(data=dataset)
    formatted_data = []
    for item in dataset:
        tokens = item['tokens']
        sentence = list(zip(tokens, processed_tags))
        formatted_data.append(sentence)
    return formatted_data

def word2features(sent, i, spacy_features, dk=None):
    word = sent[i][0]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    features.update(spacy_features[i])

    # Domain Knowledge features
    if dk:
        for k, v in dk['sub_tokens'].items():
            features[k] = count_subtokens(word, v)
        for k, v in dk['regex'].items():
            features[k] = search_regex(word, v)

    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
        # Domain Knowledge features
        if dk:
            for k, v in dk['sub_tokens'].items():
                features[f"-1:{k}"] = count_subtokens(word1, v)
            for k, v in dk['regex'].items():
                features[k] = search_regex(word, v)
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
        # Domain Knowledge features
        if dk:
            for k, v in dk['sub_tokens'].items():
                features[f"+1:{k}"] = count_subtokens(word1, v)
            for k, v in dk['regex'].items():
                features[k] = search_regex(word, v)
    else:
        features['EOS'] = True

    return features

def process_sentence(sentence):
    spacy_features = add_spacy_ner(sentence)
    return [word2features(sentence, i, spacy_features) for i in range(len(sentence))], [label for token, label in sentence]

# generate features for dataset
def prepare_data(sentences, dk):
    X = []
    y = []
    for s in sentences:
        spacy_features = add_spacy_ner(s)
        X.append([word2features(s, i, spacy_features, dk) for i in range(len(s))])
        y.append([label for token, label in s])
    return X, y

In [14]:
# Data preprocess
X_train, y_train = prepare_data(preprocess_data(train_data), unbiased_dk)
X_test, y_test = prepare_data(preprocess_data(test_data), unbiased_dk)

In [15]:
# split different size
X_train_75, X_train_25, y_train_75, y_train_25 = train_test_split(X_train, y_train, test_size=0.25, random_state=42)
X_train_50_1, X_train_50_2, y_train_50_1, y_train_50_2 = train_test_split(X_train, y_train, test_size=0.5, random_state=42)
X_train_90, X_train_10, y_train_90, y_train_10 = train_test_split(X_train, y_train, test_size=0.10, random_state=42)

# All Data

## Tarining

In [16]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,
    'feature.possible_transitions': True
})
trainer.train('ner-model.crfsuite')

CPU times: user 15min 52s, sys: 49.4 s, total: 16min 41s
Wall time: 17min 34s


## Testing

In [17]:
tagger = pycrfsuite.Tagger()
tagger.open('ner-model.crfsuite')
y_test_pred = [tagger.tag(xseq) for xseq in X_test]
y_test_flat = [item for sublist in y_test for item in sublist]
y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

result = classification_report(y_test_flat, y_test_pred_flat)
print(result)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                           precision    recall  f1-score   support

                B-Archive       0.93      0.74      0.82       359
        B-CelestialObject       0.69      0.55      0.61      3609
  B-CelestialObjectRegion       0.63      0.08      0.14       723
        B-CelestialRegion       0.29      0.14      0.19       209
               B-Citation       0.96      0.94      0.95      8621
          B-Collaboration       0.87      0.79      0.83       428
      B-ComputingFacility       0.80      0.44      0.57       607
               B-Database       0.93      0.68      0.79       342
                B-Dataset       0.54      0.16      0.25       516
 B-EntityOfFutureInterest       0.57      0.02      0.04       435
                  B-Event       0.59      0.41      0.48        59
             B-Fellowship       0.68      0.55      0.61       607
                B-Formula       0.82      0.69      0.75      3452
                  B-Grant       0.47      0.42      0.44     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Half Data

## Tarining

In [None]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train_50_1, y_train_50_1):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,
    'feature.possible_transitions': True
})
trainer.train('ner-model.crfsuite')

## Testing

In [None]:
tagger = pycrfsuite.Tagger()
tagger.open('ner-model.crfsuite')
y_test_pred = [tagger.tag(xseq) for xseq in X_test]
y_test_flat = [item for sublist in y_test for item in sublist]
y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

result_2 = classification_report(y_test_flat, y_test_pred_flat)
print(result_2)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                           precision    recall  f1-score   support

                B-Archive       0.90      0.74      0.81       359
        B-CelestialObject       0.68      0.48      0.57      3609
  B-CelestialObjectRegion       0.64      0.09      0.16       723
        B-CelestialRegion       0.26      0.09      0.14       209
               B-Citation       0.95      0.93      0.94      8621
          B-Collaboration       0.89      0.78      0.83       428
      B-ComputingFacility       0.79      0.44      0.56       607
               B-Database       0.94      0.67      0.78       342
                B-Dataset       0.53      0.17      0.25       516
 B-EntityOfFutureInterest       0.00      0.00      0.00       435
                  B-Event       0.33      0.22      0.27        59
             B-Fellowship       0.73      0.55      0.63       607
                B-Formula       0.81      0.65      0.72      3452
                  B-Grant       0.44      0.37      0.41     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# 1/4 Data

## Tarining

In [None]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train_25, y_train_25):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,
    'feature.possible_transitions': True
})
trainer.train('ner-model.crfsuite')

## Testing

In [None]:
tagger = pycrfsuite.Tagger()
tagger.open('ner-model.crfsuite')
y_test_pred = [tagger.tag(xseq) for xseq in X_test]
y_test_flat = [item for sublist in y_test for item in sublist]
y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

result_3 = classification_report(y_test_flat, y_test_pred_flat)
print(result_3)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                           precision    recall  f1-score   support

                B-Archive       0.90      0.74      0.81       359
        B-CelestialObject       0.68      0.48      0.57      3609
  B-CelestialObjectRegion       0.64      0.09      0.16       723
        B-CelestialRegion       0.26      0.09      0.14       209
               B-Citation       0.95      0.93      0.94      8621
          B-Collaboration       0.89      0.78      0.83       428
      B-ComputingFacility       0.79      0.44      0.56       607
               B-Database       0.94      0.67      0.78       342
                B-Dataset       0.53      0.17      0.25       516
 B-EntityOfFutureInterest       0.00      0.00      0.00       435
                  B-Event       0.33      0.22      0.27        59
             B-Fellowship       0.73      0.55      0.63       607
                B-Formula       0.81      0.65      0.72      3452
                  B-Grant       0.44      0.37      0.41     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# 1/10 Data

## Tarining

In [None]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train_50_1, y_train_50_1):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,
    'feature.possible_transitions': True
})
trainer.train('ner-model.crfsuite')

## Testing

In [None]:
tagger = pycrfsuite.Tagger()
tagger.open('ner-model.crfsuite')
y_test_pred = [tagger.tag(xseq) for xseq in X_test]
y_test_flat = [item for sublist in y_test for item in sublist]
y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

result_4 = classification_report(y_test_flat, y_test_pred_flat)
print(result_4)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                           precision    recall  f1-score   support

                B-Archive       0.90      0.74      0.81       359
        B-CelestialObject       0.68      0.48      0.57      3609
  B-CelestialObjectRegion       0.64      0.09      0.16       723
        B-CelestialRegion       0.26      0.09      0.14       209
               B-Citation       0.95      0.93      0.94      8621
          B-Collaboration       0.89      0.78      0.83       428
      B-ComputingFacility       0.79      0.44      0.56       607
               B-Database       0.94      0.67      0.78       342
                B-Dataset       0.53      0.17      0.25       516
 B-EntityOfFutureInterest       0.00      0.00      0.00       435
                  B-Event       0.33      0.22      0.27        59
             B-Fellowship       0.73      0.55      0.63       607
                B-Formula       0.81      0.65      0.72      3452
                  B-Grant       0.44      0.37      0.41     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
