In [1]:
!pip install -q datasets python-crfsuite scikit-learn

In [1]:
import datasets
from sklearn.model_selection import train_test_split
import pycrfsuite
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, matthews_corrcoef

In [2]:
# Load the "adsabs/WIESP2022-NER" dataset and bind each of its three splits
# to its own name.
dataset = datasets.load_dataset("adsabs/WIESP2022-NER")
train_data, validation_data, test_data = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

In [3]:
def word2features(sent, i):
    """Build the feature dictionary for the token at position ``i`` of ``sent``.

    Parameters
    ----------
    sent : sequence
        Sentence as a sequence of items whose element 0 is the token string
        (e.g. ``(token, label)`` pairs).
    i : int
        Index of the token to describe.

    Returns
    -------
    dict
        A constant bias, lexical/shape features of the token itself, the
        same kind of features for the previous and next token when they
        exist, and ``'BOS'`` / ``'EOS'`` flags when they do not.
    """
    token = sent[i][0]
    last_index = len(sent) - 1

    feats = {}
    feats['bias'] = 1.0
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()

    # Left context: no predecessor means the token starts the sentence.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i - 1][0]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()

    # Right context: no successor means the token ends the sentence.
    if i >= last_index:
        feats['EOS'] = True
    else:
        next_token = sent[i + 1][0]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()

    return feats

In [4]:
def prepare_data(sentences):
    """Convert labelled sentences into parallel feature and label sequences.

    Parameters
    ----------
    sentences : iterable
        Each sentence is a sized sequence of ``(token, label)`` pairs.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X[k]`` is the list of ``word2features`` dicts for
        sentence ``k`` and ``y[k]`` is the list of its labels.
    """
    # Walk the input exactly once so one-shot iterables keep working.
    per_sentence = [
        (
            [word2features(sentence, pos) for pos in range(len(sentence))],
            [tag for _token, tag in sentence],
        )
        for sentence in sentences
    ]
    X = [feature_seq for feature_seq, _ in per_sentence]
    y = [tag_seq for _, tag_seq in per_sentence]
    return X, y

In [5]:
def preprocess_data(dataset):
    """Pair every token with its tag, one list of pairs per dataset item.

    Parameters
    ----------
    dataset : iterable
        Items supporting ``item['tokens']`` and ``item['ner_tags']``.

    Returns
    -------
    list
        One list of ``(token, ner_tag)`` tuples per item, in input order.
    """
    return [
        list(zip(record['tokens'], record['ner_tags']))
        for record in dataset
    ]

In [6]:
# Run each split through preprocess_data and prepare_data, giving one
# (features, labels) pair per split.
(X_train, y_train), (X_valid, y_valid), (X_test, y_test) = (
    prepare_data(preprocess_data(split))
    for split in (train_data, validation_data, test_data)
)

In [7]:
# Split the training data three times with the same seed.
# train_test_split returns (train part, test part) for each input array, so
# the second name of each pair holds the `test_size` fraction:
# *_25 -> 25%, *_50_2 -> 50%, *_10 -> 10% of the training sentences.
X_train_75, X_train_25, y_train_75, y_train_25 = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)
X_train_50_1, X_train_50_2, y_train_50_1, y_train_50_2 = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42
)
X_train_90, X_train_10, y_train_90, y_train_10 = train_test_split(
    X_train, y_train, test_size=0.10, random_state=42
)

# Training

## Full data set

In [9]:
%%time
# Train on every training sentence and save the model as
# 'ner-model-full.crfsuite'.
crf_params = {
    'c1': 1.0,  # L1 penalty coefficient
    'c2': 1e-3,  # L2 penalty coefficient
    'max_iterations': 50,
    'feature.possible_transitions': True,
}

trainer = pycrfsuite.Trainer(verbose=False)
for features, labels in zip(X_train, y_train):
    trainer.append(features, labels)
trainer.set_params(crf_params)
trainer.train('ner-model-full.crfsuite')

CPU times: user 3min 5s, sys: 2.4 s, total: 3min 8s
Wall time: 3min 8s


## 50% Dataset

In [10]:
%%time
# Train on the X_train_50_1 / y_train_50_1 half of the training data and
# save the model as 'ner-model-50.crfsuite'.
crf_params = {
    'c1': 1.0,  # L1 penalty coefficient
    'c2': 1e-3,  # L2 penalty coefficient
    'max_iterations': 50,
    'feature.possible_transitions': True,
}

trainer = pycrfsuite.Trainer(verbose=False)
for features, labels in zip(X_train_50_1, y_train_50_1):
    trainer.append(features, labels)
trainer.set_params(crf_params)
trainer.train('ner-model-50.crfsuite')

CPU times: user 1min 28s, sys: 1.2 s, total: 1min 29s
Wall time: 1min 29s


## 25% Dataset

In [11]:
%%time
# Train on the X_train_25 / y_train_25 subset of the training data and
# save the model as 'ner-model-25.crfsuite'.
crf_params = {
    'c1': 1.0,  # L1 penalty coefficient
    'c2': 1e-3,  # L2 penalty coefficient
    'max_iterations': 50,
    'feature.possible_transitions': True,
}

trainer = pycrfsuite.Trainer(verbose=False)
for features, labels in zip(X_train_25, y_train_25):
    trainer.append(features, labels)
trainer.set_params(crf_params)
trainer.train('ner-model-25.crfsuite')

CPU times: user 44.6 s, sys: 511 ms, total: 45.1 s
Wall time: 45.1 s


## 10% Dataset

In [12]:
%%time
# Train on the X_train_10 / y_train_10 subset of the training data and
# save the model as 'ner-model-10.crfsuite'.
crf_params = {
    'c1': 1.0,  # L1 penalty coefficient
    'c2': 1e-3,  # L2 penalty coefficient
    'max_iterations': 50,
    'feature.possible_transitions': True,
}

trainer = pycrfsuite.Trainer(verbose=False)
for features, labels in zip(X_train_10, y_train_10):
    trainer.append(features, labels)
trainer.set_params(crf_params)
trainer.train('ner-model-10.crfsuite')

CPU times: user 15.4 s, sys: 195 ms, total: 15.6 s
Wall time: 15.6 s


## Hand picked tags:

* Organization
* Observatory

* Celestial
* Event

* Celestial Region
* Identifier

# Validation


In [10]:
# Score the model trained on the full training set against the validation split.
tagger = pycrfsuite.Tagger()
# The training cells save their models as 'ner-model-{full,50,25,10}.crfsuite';
# no cell writes 'ner-model.crfsuite', so open the full-data model, which a
# fresh "Restart & Run All" actually produces.
tagger.open('ner-model-full.crfsuite')
y_pred = [tagger.tag(xseq) for xseq in X_valid]

# classification_report works on flat label lists, so drop sentence boundaries.
y_valid_flat = [item for sublist in y_valid for item in sublist]
y_pred_flat = [item for sublist in y_pred for item in sublist]

# zero_division=0 reports the same 0.0 scores as the default "warn" setting
# for labels with no predictions, without emitting a warning per metric.
result = classification_report(y_valid_flat, y_pred_flat, zero_division=0)
print(result)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                           precision    recall  f1-score   support

                B-Archive       0.83      0.69      0.76       153
        B-CelestialObject       0.77      0.48      0.59      2285
  B-CelestialObjectRegion       0.56      0.19      0.29       150
        B-CelestialRegion       0.48      0.13      0.20       102
               B-Citation       0.96      0.93      0.94      4820
          B-Collaboration       0.89      0.69      0.78       238
      B-ComputingFacility       0.88      0.48      0.62       360
               B-Database       0.89      0.69      0.78       199
                B-Dataset       0.46      0.14      0.22       222
 B-EntityOfFutureInterest       0.33      0.02      0.04        52
                  B-Event       0.40      0.05      0.10        37
             B-Fellowship       0.79      0.60      0.68       326
                B-Formula       0.77      0.69      0.73      1541
                  B-Grant       0.72      0.61      0.66     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Testing

In [11]:
# Score the model trained on the full training set against the test split.
tagger = pycrfsuite.Tagger()
# The training cells save their models as 'ner-model-{full,50,25,10}.crfsuite';
# no cell writes 'ner-model.crfsuite', so open the full-data model, which a
# fresh "Restart & Run All" actually produces.
tagger.open('ner-model-full.crfsuite')
y_test_pred = [tagger.tag(xseq) for xseq in X_test]

# classification_report works on flat label lists, so drop sentence boundaries.
y_test_flat = [item for sublist in y_test for item in sublist]
y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

# zero_division=0 reports the same 0.0 scores as the default "warn" setting
# for labels with no predictions, without emitting a warning per metric.
result = classification_report(y_test_flat, y_test_pred_flat, zero_division=0)
print(result)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                           precision    recall  f1-score   support

                B-Archive       0.91      0.74      0.82       359
        B-CelestialObject       0.72      0.47      0.57      3609
  B-CelestialObjectRegion       0.68      0.10      0.17       723
        B-CelestialRegion       0.40      0.09      0.15       209
               B-Citation       0.96      0.93      0.94      8621
          B-Collaboration       0.88      0.77      0.82       428
      B-ComputingFacility       0.86      0.42      0.56       607
               B-Database       0.91      0.68      0.78       342
                B-Dataset       0.53      0.13      0.21       516
 B-EntityOfFutureInterest       0.17      0.00      0.01       435
                  B-Event       0.83      0.34      0.48        59
             B-Fellowship       0.71      0.57      0.63       607
                B-Formula       0.83      0.66      0.74      3452
                  B-Grant       0.46      0.39      0.42     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
# Compare macro-F1 and Matthews correlation on the test split for the models
# trained on the full, 50%, 25% and 10% training subsets.
tagger = pycrfsuite.Tagger()

# The gold labels are the same for every model, so flatten them only once.
y_test_flat = [item for sublist in y_test for item in sublist]

# (printed heading, model file written by the corresponding training cell)
for heading, model_path in [
    ("Full Dataset", 'ner-model-full.crfsuite'),
    ("50% Dataset", 'ner-model-50.crfsuite'),
    ("25% Dataset", 'ner-model-25.crfsuite'),
    ("10% Dataset", 'ner-model-10.crfsuite'),
]:
    print(heading)
    tagger.open(model_path)
    y_test_pred = [tagger.tag(xseq) for xseq in X_test]
    y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

    f1 = f1_score(y_test_flat, y_test_pred_flat, average='macro')
    mcc = matthews_corrcoef(y_test_flat, y_test_pred_flat)
    print(f"f1_score: {f1:.4f}, mcc: {mcc:.4f}")

Full Dataset
f1_score: 0.4944, mcc: 0.8098
50% Dataset
f1_score: 0.4652, mcc: 0.7925
25% Dataset
f1_score: 0.3969, mcc: 0.7627
10% Dataset
f1_score: 0.3236, mcc: 0.7184


In [8]:
import pandas

# Write one per-label classification report (as CSV) for each of the models
# trained on the full, 50%, 25% and 10% training subsets.
tagger = pycrfsuite.Tagger()

# The gold labels are the same for every model, so flatten them only once.
y_test_flat = [item for sublist in y_test for item in sublist]

# (printed heading, model file to load, CSV file to write)
for heading, model_path, csv_path in [
    ("Full Dataset", 'ner-model-full.crfsuite', 'pure-full.csv'),
    ("50% Dataset", 'ner-model-50.crfsuite', 'pure-half.csv'),
    ("25% Dataset", 'ner-model-25.crfsuite', 'pure-25.csv'),
    ("10% Dataset", 'ner-model-10.crfsuite', 'pure-10.csv'),
]:
    print(heading)
    tagger.open(model_path)
    y_test_pred = [tagger.tag(xseq) for xseq in X_test]
    y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

    # zero_division=0 reports the same 0.0 scores as the default "warn"
    # setting for labels with no predictions, without the repeated warnings.
    result = classification_report(
        y_test_flat, y_test_pred_flat, output_dict=True, zero_division=0
    )
    # Transpose so each label becomes a row and each metric a column.
    df = pandas.DataFrame(result).transpose()
    df.to_csv(csv_path)
    print(result)

Full Dataset


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'B-Archive': {'precision': 0.9140893470790378, 'recall': 0.7409470752089137, 'f1-score': 0.8184615384615385, 'support': 359.0}, 'B-CelestialObject': {'precision': 0.7187366623986342, 'recall': 0.46661124965364364, 'f1-score': 0.5658602150537635, 'support': 3609.0}, 'B-CelestialObjectRegion': {'precision': 0.6796116504854369, 'recall': 0.09681881051175657, 'f1-score': 0.1694915254237288, 'support': 723.0}, 'B-CelestialRegion': {'precision': 0.3958333333333333, 'recall': 0.09090909090909091, 'f1-score': 0.14785992217898833, 'support': 209.0}, 'B-Citation': {'precision': 0.9593204115817181, 'recall': 0.9300545180373506, 'f1-score': 0.9444608045232346, 'support': 8621.0}, 'B-Collaboration': {'precision': 0.8793565683646113, 'recall': 0.7663551401869159, 'f1-score': 0.818976279650437, 'support': 428.0}, 'B-ComputingFacility': {'precision': 0.8605442176870748, 'recall': 0.41680395387149916, 'f1-score': 0.5615982241953386, 'support': 607.0}, 'B-Database': {'precision': 0.9140625, 'recall': 0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'B-Archive': {'precision': 0.8958333333333334, 'recall': 0.7186629526462396, 'f1-score': 0.7975270479134466, 'support': 359.0}, 'B-CelestialObject': {'precision': 0.6963711529627928, 'recall': 0.4200609587143253, 'f1-score': 0.5240235050120982, 'support': 3609.0}, 'B-CelestialObjectRegion': {'precision': 0.6015625, 'recall': 0.10650069156293222, 'f1-score': 0.1809635722679201, 'support': 723.0}, 'B-CelestialRegion': {'precision': 0.28, 'recall': 0.06698564593301436, 'f1-score': 0.10810810810810811, 'support': 209.0}, 'B-Citation': {'precision': 0.9482903419316137, 'recall': 0.916830994084213, 'f1-score': 0.9322953526775183, 'support': 8621.0}, 'B-Collaboration': {'precision': 0.9178885630498533, 'recall': 0.7313084112149533, 'f1-score': 0.8140442132639792, 'support': 428.0}, 'B-ComputingFacility': {'precision': 0.7617554858934169, 'recall': 0.400329489291598, 'f1-score': 0.5248380129589633, 'support': 607.0}, 'B-Database': {'precision': 0.9297520661157025, 'recall': 0.6578947368421053

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'B-Archive': {'precision': 0.9567901234567902, 'recall': 0.43175487465181056, 'f1-score': 0.5950095969289827, 'support': 359.0}, 'B-CelestialObject': {'precision': 0.6216617210682492, 'recall': 0.3482959268495428, 'f1-score': 0.44645711241342567, 'support': 3609.0}, 'B-CelestialObjectRegion': {'precision': 0.35294117647058826, 'recall': 0.016597510373443983, 'f1-score': 0.031704095112285335, 'support': 723.0}, 'B-CelestialRegion': {'precision': 0.125, 'recall': 0.004784688995215311, 'f1-score': 0.009216589861751152, 'support': 209.0}, 'B-Citation': {'precision': 0.9407594328286469, 'recall': 0.9081313072729382, 'f1-score': 0.9241574691613056, 'support': 8621.0}, 'B-Collaboration': {'precision': 0.8698412698412699, 'recall': 0.6401869158878505, 'f1-score': 0.7375504710632571, 'support': 428.0}, 'B-ComputingFacility': {'precision': 0.6516516516516516, 'recall': 0.357495881383855, 'f1-score': 0.46170212765957447, 'support': 607.0}, 'B-Database': {'precision': 0.9128630705394191, 'recall'

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'B-Archive': {'precision': 0.9444444444444444, 'recall': 0.1894150417827298, 'f1-score': 0.31554524361948955, 'support': 359.0}, 'B-CelestialObject': {'precision': 0.5211195928753181, 'recall': 0.2837351066777501, 'f1-score': 0.3674201650520273, 'support': 3609.0}, 'B-CelestialObjectRegion': {'precision': 0.46153846153846156, 'recall': 0.024896265560165973, 'f1-score': 0.047244094488188976, 'support': 723.0}, 'B-CelestialRegion': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 209.0}, 'B-Citation': {'precision': 0.9186539159908002, 'recall': 0.8802923094768589, 'f1-score': 0.8990640919322355, 'support': 8621.0}, 'B-Collaboration': {'precision': 0.7870370370370371, 'recall': 0.5957943925233645, 'f1-score': 0.6781914893617021, 'support': 428.0}, 'B-ComputingFacility': {'precision': 0.6408450704225352, 'recall': 0.14991762767710048, 'f1-score': 0.24299065420560748, 'support': 607.0}, 'B-Database': {'precision': 0.9113300492610837, 'recall': 0.5409356725146199, 'f1-score': 0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
