In [None]:
import numpy as np
import pandas as pd
from transformers import BertModel
import torch
from torch.nn import CrossEntropyLoss

In [None]:
import sys
# Make the sibling "parser" directory importable. A forward slash is used
# because it is accepted as a separator on Windows as well as POSIX, whereas
# the previous "..\\parser" only resolved on Windows.
sys.path.append("../parser")
import conll04_parser

In [None]:
from ipywidgets import IntProgress
from IPython.display import display

In [None]:
# train_raw = conll04_parser.extract_data("train")

In [None]:
# dev_raw = conll04_parser.extract_data("dev")

In [None]:
# test_raw = conll04_parser.extract_data("test")

In [None]:
# bert_model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
def transform_data(
    raw_data, 
    pretrain_model, 
    ignore_index=CrossEntropyLoss().ignore_index,
    max_token_count=512,
    cls_token=conll04_parser.CLS_TOKEN,
    sep_token=conll04_parser.SEP_TOKEN
):
    """Transform the parsed dataset with a pre-trained model
    Only the first token of each word is labeled, the others are masked as 'ignore_index'
    The label of O is 0
    The label of I is the negation of the corresponding label of B

    Parameters
    ----------
    raw_data : sequence of dict
        Each item must expose ``item["data_frame"]`` with the columns
        ``ids``, ``token_ids``, ``entity_embedding`` and ``words``.
    pretrain_model : callable
        Called with ``input_ids``, ``token_type_ids`` and ``attention_mask``
        tensors of shape ``(1, n_tokens)``; its result must expose
        ``last_hidden_state``.
    ignore_index : int
        Label written for tokens that must not be scored (continuation
        tokens of a word and the added CLS/SEP tokens).
    max_token_count : int or None
        Upper bound on the tokens sent to the model per document, including
        the CLS/SEP tokens. Longer documents are truncated; ``None`` disables
        the limit.
    cls_token, sep_token
        Token ids prepended/appended to every document. A falsy value
        disables the corresponding token.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The first frame has one row per token holding the model's last hidden
        state; the second has one row per token with the label in column 0
        and the word in column 1.
    """
    progress = IntProgress(min=0, max=len(raw_data)) # instantiate the bar
    display(progress) # display the bar
    
    # Slots reserved for the special tokens added around every document.
    padding_token_count = (1 if cls_token else 0) + (1 if sep_token else 0)
    if max_token_count is None:
        content_limit = None
    else:
        content_limit = max(max_token_count - padding_token_count, 0)
    
    transformed_tokens = []
    true_labels = []
    true_words = []
    
    for document in raw_data:
        progress.value += 1
        frame = document["data_frame"]
        ids = frame["ids"].tolist()
        tokens = frame["token_ids"].tolist()
        labels = frame["entity_embedding"].tolist()
        words = frame["words"].tolist()
        
        for i in range(1, len(tokens)):
            if ids[i] == ids[i-1]:
                # Extra tokens from the same word are ignored
                labels[i] = ignore_index
        
        # Enforce the length limit before the special tokens are added so the
        # final sequence never exceeds max_token_count.
        if content_limit is not None and len(tokens) > content_limit:
            tokens = tokens[:content_limit]
            labels = labels[:content_limit]
            words = words[:content_limit]
        
        if cls_token: 
            tokens = [cls_token] + tokens
            labels = [ignore_index] + labels
            words = ["[CLS]"] + words
        if sep_token:
            tokens.append(sep_token)
            labels.append(ignore_index)
            words.append("[SEP]")
        
        if not tokens:
            # Nothing to encode; an empty tensor cannot be fed to the model.
            continue
        
        # Only the activations are needed, so skip building the autograd graph.
        with torch.no_grad():
            outputs = pretrain_model(
                input_ids=torch.tensor([tokens]), 
                token_type_ids=torch.tensor([[0] * len(tokens)]),
                attention_mask=torch.tensor([[1] * len(tokens)])
            )
        transformed_tokens += outputs.last_hidden_state[0].tolist()
        true_labels += labels
        true_words += words
            
    assert len(transformed_tokens) == len(true_labels) == len(true_words)
    return pd.DataFrame(transformed_tokens), pd.DataFrame(list(zip(true_labels, true_words)))

In [None]:
# train_tokens, train_labels = transform_data(train_raw, bert_model)
# print("Saving train tokens of shape", train_tokens.shape)
# train_tokens.to_csv("train_tokens.csv", index=False)
# print("Saving train labels of shape", train_labels.shape)
# train_labels.to_csv("train_labels.csv", index=False)

In [None]:
# dev_tokens, dev_labels = transform_data(dev_raw, bert_model)
# print("Saving dev tokens of shape", dev_tokens.shape)
# dev_tokens.to_csv("dev_tokens.csv", index=False)
# print("Saving dev labels of shape", dev_labels.shape)
# dev_labels.to_csv("dev_labels.csv", index=False)

In [None]:
# test_tokens, test_labels = transform_data(test_raw, bert_model)
# print("Saving test tokens of shape", test_tokens.shape)
# test_tokens.to_csv("test_tokens.csv", index=False)
# print("Saving test labels of shape", test_labels.shape)
# test_labels.to_csv("test_labels.csv", index=False)

In [None]:
# Load the cached features and (label, word) tables. Forward slashes are used
# because they resolve on Windows as well as POSIX; the previous backslash
# separators only worked on Windows.
train_tokens = pd.read_csv("conll04/train_tokens.csv")
train_labels = pd.read_csv("conll04/train_labels.csv")
dev_tokens = pd.read_csv("conll04/dev_tokens.csv")
dev_labels = pd.read_csv("conll04/dev_labels.csv")
test_tokens = pd.read_csv("conll04/test_tokens.csv")
test_labels = pd.read_csv("conll04/test_labels.csv")

In [None]:
# One line per split: shape of the feature table, then of the label table.
print("\n".join(
    f"{features.shape} {targets.shape}"
    for features, targets in (
        (train_tokens, train_labels),
        (dev_tokens, dev_labels),
        (test_tokens, test_labels),
    )
))

In [None]:
# Keep only the rows whose label (column "0") is not the -100 ignore marker.
# The feature table is filtered first because its mask comes from the labels.
train_tokens = train_tokens.loc[train_labels["0"].ne(-100)]
train_labels = train_labels.loc[train_labels["0"].ne(-100)]

In [None]:
# Keep only the rows whose label (column "0") is not the -100 ignore marker.
# The feature table is filtered first because its mask comes from the labels.
dev_tokens = dev_tokens.loc[dev_labels["0"].ne(-100)]
dev_labels = dev_labels.loc[dev_labels["0"].ne(-100)]

In [None]:
# Keep only the rows whose label (column "0") is not the -100 ignore marker.
# The feature table is filtered first because its mask comes from the labels.
test_tokens = test_tokens.loc[test_labels["0"].ne(-100)]
test_labels = test_labels.loc[test_labels["0"].ne(-100)]

In [None]:
# Stack the train and dev splits into a single training set with a fresh
# 0..n-1 index; features and labels are concatenated in the same order.
training_tokens, training_labels = (
    pd.concat(list(parts), ignore_index=True)
    for parts in ((train_tokens, dev_tokens), (train_labels, dev_labels))
)

In [None]:
# Shapes of the merged training set and of the test set, on one line.
print(*(table.shape for table in (training_tokens, training_labels, test_tokens, test_labels)))

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [None]:
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# Inverse of conll04_parser.entity_encode: maps each encoded value back to its key.
label_map_bio = dict(zip(conll04_parser.entity_encode.values(), conll04_parser.entity_encode.keys()))

In [None]:
def run_classifier_bio(clf, x_train, y_train, x_test, y_test):
    """Fit `clf`, predict on the test data and report per-class metrics.

    The classifier is fitted in place on (x_train, y_train). Precision,
    recall, F-score and support are computed for every class in
    `clf.classes_`, printed as a table whose rows are named through
    `label_map_bio`, and returned.

    Returns the fitted classifier and the metrics DataFrame.
    """
    print("Fitting...")
    clf.fit(x_train, y_train)
    print("Predicting...")
    predictions = clf.predict(x_test)

    print("Results:")
    # average=None yields one value per entry of `labels`, in that order.
    scores = precision_recall_fscore_support(
        y_test, predictions, average=None, labels=clf.classes_
    )
    column_names = ("precision", "recall", "fbeta_score", "support")
    row_names = [label_map_bio[encoded] for encoded in clf.classes_]
    result = pd.DataFrame(dict(zip(column_names, scores)), index=row_names)
    print(result)

    return clf, result

In [None]:
# k-nearest-neighbours with default settings, trained on train+dev.
kneighbor_clf, kneighbor_result = run_classifier_bio(
    clf=KNeighborsClassifier(),
    x_train=training_tokens,
    y_train=training_labels["0"],
    x_test=test_tokens,
    y_test=test_labels["0"],
)

# Appears to be the table printed by an earlier run of this cell, kept as a string.
r = """
Results:
         precision    recall  fbeta_score  support
O         0.956005  0.983709     0.969660     6384
B-Loc     0.787686  0.868852     0.826281      427
I-Loc     0.837838  0.604878     0.702550      205
B-Peop    0.874172  0.822430     0.847512      321
I-Peop    0.850515  0.894309     0.871863      369
B-Org     0.741722  0.565657     0.641834      198
I-Org     0.674157  0.497925     0.572792      241
B-Other   0.790476  0.624060     0.697479      133
I-Other   0.927083  0.684615     0.787611      130
"""

In [None]:
# Random forest with 20 trees and default class weights.
random_forest_clf, random_forest_result = run_classifier_bio(
    clf=RandomForestClassifier(n_estimators=20, verbose=1),
    x_train=training_tokens,
    y_train=training_labels["0"],
    x_test=test_tokens,
    y_test=test_labels["0"],
)

# Appears to be the table printed by an earlier run of this cell, kept as a string.
r = """
Results:
         precision    recall  fbeta_score  support
O         0.844645  0.998120     0.914991     6384
B-Loc     0.748148  0.473068     0.579627      427
I-Loc     0.955224  0.312195     0.470588      205
B-Peop    0.843243  0.485981     0.616601      321
I-Peop    0.919149  0.585366     0.715232      369
B-Org     0.807692  0.106061     0.187500      198
I-Org     0.894737  0.070539     0.130769      241
B-Other   0.950000  0.142857     0.248366      133
I-Other   0.976190  0.315385     0.476744      130
"""

In [None]:
# Random forest with 20 trees and class_weight="balanced".
random_forest_clf, random_forest_result = run_classifier_bio(
    clf=RandomForestClassifier(n_estimators=20, class_weight="balanced", verbose=1),
    x_train=training_tokens,
    y_train=training_labels["0"],
    x_test=test_tokens,
    y_test=test_labels["0"],
)

# Appears to be the table printed by an earlier run of this cell, kept as a string.
r = """
Results:
         precision    recall  fbeta_score  support
O         0.815591  0.999687     0.898304     6384
B-Loc     0.893443  0.255269     0.397086      427
I-Loc     1.000000  0.278049     0.435115      205
B-Peop    0.943662  0.417445     0.578834      321
I-Peop    0.960674  0.463415     0.625229      369
B-Org     0.944444  0.085859     0.157407      198
I-Org     1.000000  0.053942     0.102362      241
B-Other   0.950000  0.142857     0.248366      133
I-Other   1.000000  0.253846     0.404908      130
"""

In [None]:
# Random forest with 20 trees and class_weight="balanced_subsample".
random_forest_clf, random_forest_result = run_classifier_bio(
    clf=RandomForestClassifier(n_estimators=20, class_weight="balanced_subsample", verbose=1),
    x_train=training_tokens,
    y_train=training_labels["0"],
    x_test=test_tokens,
    y_test=test_labels["0"],
)

# Appears to be the table printed by an earlier run of this cell, kept as a string.
r = """
Results:
         precision    recall  fbeta_score  support
O         0.817425  0.999373     0.899288     6384
B-Loc     0.868217  0.262295     0.402878      427
I-Loc     0.982456  0.273171     0.427481      205
B-Peop    0.964789  0.426791     0.591793      321
I-Peop    0.953125  0.495935     0.652406      369
B-Org     1.000000  0.095960     0.175115      198
I-Org     1.000000  0.037344     0.072000      241
B-Other   0.947368  0.135338     0.236842      133
I-Other   1.000000  0.276923     0.433735      130
"""

In [None]:
# Multi-layer perceptron with a single hidden layer of 512 units.
nn_clf, nn_result = run_classifier_bio(
    clf=MLPClassifier(hidden_layer_sizes=(512,), verbose=True),
    x_train=training_tokens,
    y_train=training_labels["0"],
    x_test=test_tokens,
    y_test=test_labels["0"],
)

# Appears to be the table printed by an earlier run of this cell, kept as a string.
r = """
Results:
         precision    recall  fbeta_score  support
O         0.981159  0.978853     0.980005     6384
B-Loc     0.913753  0.918033     0.915888      427
I-Loc     0.893855  0.780488     0.833333      205
B-Peop    0.944625  0.903427     0.923567      321
I-Peop    0.937008  0.967480     0.952000      369
B-Org     0.737557  0.823232     0.778043      198
I-Org     0.693069  0.871369     0.772059      241
B-Other   0.838095  0.661654     0.739496      133
I-Other   0.877193  0.769231     0.819672      130
"""

In [None]:
# Multi-layer perceptron with a single hidden layer of 1024 units.
nn_clf, nn_result = run_classifier_bio(
    clf=MLPClassifier(hidden_layer_sizes=(1024,), verbose=True),
    x_train=training_tokens,
    y_train=training_labels["0"],
    x_test=test_tokens,
    y_test=test_labels["0"],
)

# Appears to be the table printed by an earlier run of this cell, kept as a string.
r = """
Results:
         precision    recall  fbeta_score  support
O         0.976516  0.983553     0.980022     6384
B-Loc     0.919908  0.941452     0.930556      427
I-Loc     0.875706  0.756098     0.811518      205
B-Peop    0.958466  0.934579     0.946372      321
I-Peop    0.954787  0.972900     0.963758      369
B-Org     0.814433  0.797980     0.806122      198
I-Org     0.759843  0.800830     0.779798      241
B-Other   0.818966  0.714286     0.763052      133
I-Other   0.855856  0.730769     0.788382      130
"""