In [1]:
!pip install -q datasets python-crfsuite scikit-learn

In [4]:
import datasets
from sklearn.model_selection import train_test_split
import pycrfsuite
from sklearn.metrics import classification_report

In [3]:
# Fetch the WIESP2022-NER corpus and pull out its three named splits.
dataset = datasets.load_dataset("adsabs/WIESP2022-NER")
train_data, validation_data, test_data = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

In [5]:
def word2features(sent, i):
    """Build the CRF feature dictionary for the token at position ``i``.

    Args:
        sent: Indexable sequence of items whose element ``[0]`` is the token
            string (the remaining elements are ignored here).
        i: Position of the token to describe.

    Returns:
        dict mapping feature names to values: lower-cased form, 2- and
        3-character suffixes and casing/digit flags of the token itself,
        the lower-cased form and casing flags of the previous and next
        tokens when they exist, and ``'BOS'`` / ``'EOS'`` markers when the
        token has no predecessor / successor.
    """
    token = sent[i][0]
    final_position = len(sent) - 1

    # Features describing the current token only.
    features = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
    }

    # Left context: either the previous token or a beginning-of-sentence flag.
    if i <= 0:
        features['BOS'] = True
    else:
        previous = sent[i - 1][0]
        features['-1:word.lower()'] = previous.lower()
        features['-1:word.istitle()'] = previous.istitle()
        features['-1:word.isupper()'] = previous.isupper()

    # Right context: either the next token or an end-of-sentence flag.
    if i >= final_position:
        features['EOS'] = True
    else:
        following = sent[i + 1][0]
        features['+1:word.lower()'] = following.lower()
        features['+1:word.istitle()'] = following.istitle()
        features['+1:word.isupper()'] = following.isupper()

    return features

In [6]:
def prepare_data(sentences):
    """Turn (token, label) sentences into parallel feature and label sequences.

    Args:
        sentences: Iterable of sentences, each a sized, indexable sequence of
            ``(token, label)`` pairs.

    Returns:
        Tuple ``(X, y)`` where ``X[k]`` is the list of per-token feature dicts
        produced by ``word2features`` for sentence ``k`` and ``y[k]`` is the
        list of that sentence's labels, in token order.
    """
    feature_seqs = []
    label_seqs = []
    for sentence in sentences:
        # One feature dict per token position in the sentence.
        token_features = []
        for position in range(len(sentence)):
            token_features.append(word2features(sentence, position))
        feature_seqs.append(token_features)
        # Labels are the second element of every (token, label) pair.
        label_seqs.append([tag for _token, tag in sentence])
    return feature_seqs, label_seqs

In [7]:
def preprocess_data(dataset):
    """Convert dataset records into sentences of (token, tag) pairs.

    Args:
        dataset: Iterable of mapping-like records exposing ``'tokens'`` and
            ``'ner_tags'`` entries of parallel iterables.

    Returns:
        List with one entry per record; each entry is the list of
        ``(token, ner_tag)`` tuples obtained by zipping the two fields
        (so it is as long as the shorter of the two).
    """
    return [
        list(zip(record['tokens'], record['ner_tags']))
        for record in dataset
    ]

In [8]:
# Featurise the dataset's own train / validation / test splits; each split
# becomes a pair (feature sequences, label sequences).
(X_train, y_train), (X_valid, y_valid), (X_test, y_test) = (
    prepare_data(preprocess_data(split))
    for split in (train_data, validation_data, test_data)
)


# Training

In [9]:
%%time
# Train a CRFsuite model on the training split and save it to disk.
trainer = pycrfsuite.Trainer(verbose=False)

# Register every training sentence (feature dicts + labels) with the trainer.
for sentence_features, sentence_labels in zip(X_train, y_train):
    trainer.append(sentence_features, sentence_labels)

crf_params = {
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,
    'feature.possible_transitions': True
}
trainer.set_params(crf_params)

# The evaluation cells load the model back from this file name.
trainer.train('ner-model.crfsuite')

CPU times: user 3min 5s, sys: 2.4 s, total: 3min 8s
Wall time: 3min 8s


## Hand picked tags:

* Organization
* Observatory

* Celestial
* Event

* Celestial Region
* Identifier

# Validation


In [10]:
# Evaluate the trained CRF on the validation split.
tagger = pycrfsuite.Tagger()
tagger.open('ner-model.crfsuite')  # model file written by trainer.train(...)

# One predicted label sequence per validation sentence.
y_pred = [tagger.tag(xseq) for xseq in X_valid]

# Flatten the per-sentence label lists into token-level lists for the report.
y_valid_flat = [item for sublist in y_valid for item in sublist]
y_pred_flat = [item for sublist in y_pred for item in sublist]

# zero_division=0 scores labels with an undefined precision/recall as 0
# explicitly, instead of emitting an UndefinedMetricWarning for each of them
# (the default "warn" setting yields the same numbers, plus the warnings).
result = classification_report(y_valid_flat, y_pred_flat, zero_division=0)
print(result)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                           precision    recall  f1-score   support

                B-Archive       0.83      0.69      0.76       153
        B-CelestialObject       0.77      0.48      0.59      2285
  B-CelestialObjectRegion       0.56      0.19      0.29       150
        B-CelestialRegion       0.48      0.13      0.20       102
               B-Citation       0.96      0.93      0.94      4820
          B-Collaboration       0.89      0.69      0.78       238
      B-ComputingFacility       0.88      0.48      0.62       360
               B-Database       0.89      0.69      0.78       199
                B-Dataset       0.46      0.14      0.22       222
 B-EntityOfFutureInterest       0.33      0.02      0.04        52
                  B-Event       0.40      0.05      0.10        37
             B-Fellowship       0.79      0.60      0.68       326
                B-Formula       0.77      0.69      0.73      1541
                  B-Grant       0.72      0.61      0.66     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Testing

In [11]:
# Evaluate the trained CRF on the held-out test split.
tagger = pycrfsuite.Tagger()
tagger.open('ner-model.crfsuite')  # model file written by trainer.train(...)

# One predicted label sequence per test sentence.
y_test_pred = [tagger.tag(xseq) for xseq in X_test]

# Flatten the per-sentence label lists into token-level lists for the report.
y_test_flat = [item for sublist in y_test for item in sublist]
y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

# zero_division=0 scores labels with an undefined precision/recall as 0
# explicitly, instead of emitting an UndefinedMetricWarning for each of them
# (the default "warn" setting yields the same numbers, plus the warnings).
result = classification_report(y_test_flat, y_test_pred_flat, zero_division=0)
print(result)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                           precision    recall  f1-score   support

                B-Archive       0.91      0.74      0.82       359
        B-CelestialObject       0.72      0.47      0.57      3609
  B-CelestialObjectRegion       0.68      0.10      0.17       723
        B-CelestialRegion       0.40      0.09      0.15       209
               B-Citation       0.96      0.93      0.94      8621
          B-Collaboration       0.88      0.77      0.82       428
      B-ComputingFacility       0.86      0.42      0.56       607
               B-Database       0.91      0.68      0.78       342
                B-Dataset       0.53      0.13      0.21       516
 B-EntityOfFutureInterest       0.17      0.00      0.01       435
                  B-Event       0.83      0.34      0.48        59
             B-Fellowship       0.71      0.57      0.63       607
                B-Formula       0.83      0.66      0.74      3452
                  B-Grant       0.46      0.39      0.42     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
