In [4]:
import datasets
from sklearn.model_selection import train_test_split
import pycrfsuite
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, matthews_corrcoef

import spacy
import multiprocessing

In [5]:
# Load the WIESP2022 NER corpus through the `datasets` library.
dataset = datasets.load_dataset("adsabs/WIESP2022-NER")

# Pull the three predefined splits out of the loaded dataset.
train_data, validation_data, test_data = (
    dataset[split] for split in ('train', 'validation', 'test')
)

# spaCy pipeline whose entity types add_spacy_ner turns into extra CRF features.
nlp = spacy.load("en_core_web_sm")

In [6]:
# One list of (token, tag) pairs per training example.
formatted_data = [
    list(zip(example['tokens'], example['ner_tags']))
    for example in dataset['train']
]
print(len(formatted_data))

1753


In [7]:
def add_spacy_ner(sentence, nlp_pipeline=None):
    """Project spaCy entity types onto the dataset's own tokenization.

    The tokens are joined with single spaces and run through a spaCy
    pipeline. Each spaCy token is attributed to the dataset token in whose
    character span (within the joined string) it starts, and every non-empty
    entity type seen for a dataset token becomes a ``spacy_ner_<type>``
    feature set to 1.

    Args:
        sentence: Sequence of ``(token, label)`` pairs; the labels are ignored.
        nlp_pipeline: Optional callable mapping a string to a ``Doc``-like
            sequence whose items expose ``idx`` and ``ent_type_``. When
            omitted, the notebook-level ``nlp`` pipeline is used.

    Returns:
        A list with one feature dict per input token; the dict is empty when
        no entity type was found for that token.
    """
    pipeline = nlp if nlp_pipeline is None else nlp_pipeline
    tokens = [token for token, _ in sentence]
    doc = pipeline(" ".join(tokens))
    spacy_features = [{} for _ in tokens]  # one (initially empty) dict per token

    spacy_index = 0
    token_start = 0  # character offset of the current token in the joined text
    for i, token in enumerate(tokens):
        # Track the end offset incrementally instead of re-joining the whole
        # prefix on every comparison.
        token_end = token_start + len(token)
        while spacy_index < len(doc) and doc[spacy_index].idx < token_end:
            ent_type = doc[spacy_index].ent_type_
            if ent_type:
                spacy_features[i]['spacy_ner_' + ent_type.lower()] = 1
            spacy_index += 1
        token_start = token_end + 1  # skip the single joining space

    return spacy_features

def word2features(sent, i, spacy_features):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of ``(token, label)`` pairs; only the tokens are read.
        i: Index of the token to describe.
        spacy_features: Per-token feature dicts (same indexing as ``sent``)
            that are merged into the result after the token's own features.

    Returns:
        A dict with a bias term, surface features of the token, the merged
        ``spacy_features[i]`` entries, and lowercase/casing features of the
        previous and next tokens. ``'BOS'`` / ``'EOS'`` mark a missing
        previous / next token.
    """
    current = sent[i][0]

    feats = {'bias': 1.0}
    feats['word.lower()'] = current.lower()
    feats['word[-3:]'] = current[-3:]
    feats['word[-2:]'] = current[-2:]
    feats['word.isupper()'] = current.isupper()
    feats['word.istitle()'] = current.istitle()
    feats['word.isdigit()'] = current.isdigit()

    # External per-token features come after the surface features.
    feats.update(spacy_features[i])

    # Left context, or a beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        previous = sent[i - 1][0]
        feats['-1:word.lower()'] = previous.lower()
        feats['-1:word.istitle()'] = previous.istitle()
        feats['-1:word.isupper()'] = previous.isupper()

    # Right context, or an end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        following = sent[i + 1][0]
        feats['+1:word.lower()'] = following.lower()
        feats['+1:word.istitle()'] = following.istitle()
        feats['+1:word.isupper()'] = following.isupper()

    return feats

def prepare_data(sentences):
    """Convert tagged sentences into parallel feature and label sequences.

    Args:
        sentences: Iterable of sentences, each a sequence of
            ``(token, label)`` pairs.

    Returns:
        ``(X, y)`` where ``X[k]`` is the list of per-token feature dicts for
        sentence ``k`` and ``y[k]`` is the list of its labels.
    """
    feature_seqs, label_seqs = [], []
    for sentence in sentences:
        ner_hints = add_spacy_ner(sentence)
        feature_seqs.append(
            [word2features(sentence, pos, ner_hints) for pos in range(len(sentence))]
        )
        label_seqs.append([tag for _, tag in sentence])
    return feature_seqs, label_seqs

def preprocess_data(dataset):
    """Pair each example's tokens with its tags.

    Args:
        dataset: Iterable of mappings with ``'tokens'`` and ``'ner_tags'``
            entries.

    Returns:
        A list with one entry per example, each a list of ``(token, tag)``
        tuples.
    """
    return [
        list(zip(example['tokens'], example['ner_tags']))
        for example in dataset
    ]


def process_sentence(sentence):
    """Featurize one sentence; the per-sentence unit of work for the pool.

    Args:
        sentence: Sequence of ``(token, label)`` pairs.

    Returns:
        ``(features, labels)``: the list of per-token feature dicts and the
        list of labels, in token order.
    """
    ner_hints = add_spacy_ner(sentence)
    features = [word2features(sentence, pos, ner_hints) for pos in range(len(sentence))]
    labels = [tag for _, tag in sentence]
    return features, labels

def preprocess_data_new(dataset):
    """Featurize a whole dataset split in parallel.

    Args:
        dataset: Iterable of mappings with ``'tokens'`` and ``'ner_tags'``
            entries.

    Returns:
        ``(X, y)`` as two tuples: ``X[k]`` is the list of per-token feature
        dicts for sentence ``k`` and ``y[k]`` the list of its labels. Both
        are empty tuples when ``dataset`` has no examples.
    """
    # process_sentence expects one list of (token, tag) pairs per sentence,
    # so the two parallel columns have to be zipped together here.
    formatted_data = [list(zip(item['tokens'], item['ner_tags'])) for item in dataset]

    # zip(*[]) yields nothing to unpack; answer directly and skip the pool.
    if not formatted_data:
        return (), ()

    # NOTE(review): workers must be able to resolve process_sentence and the
    # spaCy pipeline; under the 'spawn' start method, functions defined in a
    # notebook may not be importable - confirm on the target platform.
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        results = pool.map(process_sentence, formatted_data)

    X, y = zip(*results)
    return X, y

In [10]:
# Featurize each split: X holds per-sentence feature sequences, y the label sequences.
train_sentences = preprocess_data(train_data)
X_train, y_train = prepare_data(train_sentences)

validation_sentences = preprocess_data(validation_data)
X_valid, y_valid = prepare_data(validation_sentences)

test_sentences = preprocess_data(test_data)
X_test, y_test = prepare_data(test_sentences)

# Random subsets of the training set; the suffix is the share of sentences kept.
X_train_75, X_train_25, y_train_75, y_train_25 = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)
X_train_50_1, X_train_50_2, y_train_50_1, y_train_50_2 = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42
)
X_train_90, X_train_10, y_train_90, y_train_10 = train_test_split(
    X_train, y_train, test_size=0.10, random_state=42
)

In [11]:
%%time
# Train a CRF on the complete training set and save it to disk.
crf_params = {
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,
    'feature.possible_transitions': True
}

trainer = pycrfsuite.Trainer(verbose=False)
for features_seq, labels_seq in zip(X_train, y_train):
    trainer.append(features_seq, labels_seq)

trainer.set_params(crf_params)
trainer.train('ner-model-full-bin.crfsuite')

CPU times: user 3min 4s, sys: 2.44 s, total: 3min 6s
Wall time: 3min 7s


In [12]:
%%time
# Train a CRF on the first 50% subset of the training set and save it to disk.
crf_params = {
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,
    'feature.possible_transitions': True
}

trainer = pycrfsuite.Trainer(verbose=False)
for features_seq, labels_seq in zip(X_train_50_1, y_train_50_1):
    trainer.append(features_seq, labels_seq)

trainer.set_params(crf_params)
trainer.train('ner-model-50-bin.crfsuite')

CPU times: user 1min 30s, sys: 1.03 s, total: 1min 31s
Wall time: 1min 31s


In [13]:
%%time
# Train a CRF on the 25% subset of the training set and save it to disk.
crf_params = {
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,
    'feature.possible_transitions': True
}

trainer = pycrfsuite.Trainer(verbose=False)
for features_seq, labels_seq in zip(X_train_25, y_train_25):
    trainer.append(features_seq, labels_seq)

trainer.set_params(crf_params)
trainer.train('ner-model-25-bin.crfsuite')

CPU times: user 48 s, sys: 738 ms, total: 48.7 s
Wall time: 48.7 s


In [14]:
%%time
# Train a CRF on the 10% subset of the training set and save it to disk.
crf_params = {
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,
    'feature.possible_transitions': True
}

trainer = pycrfsuite.Trainer(verbose=False)
for features_seq, labels_seq in zip(X_train_10, y_train_10):
    trainer.append(features_seq, labels_seq)

trainer.set_params(crf_params)
trainer.train('ner-model-10-bin.crfsuite')

CPU times: user 15.7 s, sys: 232 ms, total: 15.9 s
Wall time: 15.9 s


# validation

In [None]:
# Per-label report of the full-data model on the validation split.
# The model path is the file written by the full-dataset training cell; no
# cell in this notebook writes 'ner-model.crfsuite', so opening that name
# fails on a fresh run.
tagger = pycrfsuite.Tagger()
tagger.open('ner-model-full-bin.crfsuite')
y_pred = [tagger.tag(xseq) for xseq in X_valid]

# classification_report scores individual tokens, so flatten the sentences.
y_valid_flat = [item for sublist in y_valid for item in sublist]
y_pred_flat = [item for sublist in y_pred for item in sublist]

result = classification_report(y_valid_flat, y_pred_flat)
print(result)

# test

In [None]:
# Per-label report of the full-data model on the test split.
# The model path is the file written by the full-dataset training cell; no
# cell in this notebook writes 'ner-model.crfsuite', so opening that name
# fails on a fresh run.
tagger = pycrfsuite.Tagger()
tagger.open('ner-model-full-bin.crfsuite')
y_test_pred = [tagger.tag(xseq) for xseq in X_test]

# classification_report scores individual tokens, so flatten the sentences.
y_test_flat = [item for sublist in y_test for item in sublist]
y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

result = classification_report(y_test_flat, y_test_pred_flat)
print(result)

## Macro F1 and MCC on the test set, by training-set size

In [15]:
# Compare the four saved models on the test split with token-level macro F1
# and Matthews correlation coefficient.
tagger = pycrfsuite.Tagger()

# The gold labels do not depend on the model, so flatten them once.
y_test_flat = [item for sublist in y_test for item in sublist]

# (heading to print, model file) for each training-set size, largest first.
evaluated_models = [
    ("Full Dataset", 'ner-model-full-bin.crfsuite'),
    ("50% Dataset", 'ner-model-50-bin.crfsuite'),
    ("25% Dataset", 'ner-model-25-bin.crfsuite'),
    ("10% Dataset", 'ner-model-10-bin.crfsuite'),
]

for heading, model_path in evaluated_models:
    print(heading)
    tagger.open(model_path)
    y_test_pred = [tagger.tag(xseq) for xseq in X_test]
    y_test_pred_flat = [item for sublist in y_test_pred for item in sublist]

    f1 = f1_score(y_test_flat, y_test_pred_flat, average='macro')
    mcc = matthews_corrcoef(y_test_flat, y_test_pred_flat)
    print(f"f1_score: {f1:.4f}, mcc: {mcc:.4f}")

Full Dataset
f1_score: 0.4846, mcc: 0.8069
50% Dataset
f1_score: 0.4685, mcc: 0.7931
25% Dataset
f1_score: 0.4061, mcc: 0.7732
10% Dataset
f1_score: 0.3306, mcc: 0.7255
