In [3]:
import numpy as np
import pandas as pd
from transformers import BertModel
import torch
from torch.nn import CrossEntropyLoss

In [4]:
import os
import sys

# Make the sibling "parser" directory importable so that `internal_parser`
# resolves. The path is built with os.path.join rather than a hard-coded
# backslash so the notebook also works on non-Windows systems, and it is only
# appended once so re-running this cell does not grow sys.path.
_PARSER_DIR = os.path.join("..", "parser")
if _PARSER_DIR not in sys.path:
    sys.path.append(_PARSER_DIR)
import internal_parser

In [5]:
from ipywidgets import IntProgress
from IPython.display import display

In [6]:
# Parse the "Training" document set with the project-local parser.
# The result is later iterated by transform_data, which reads each item's
# "data_frame" and "entity_position" entries.
training_raw = internal_parser.extract_data(internal_parser.get_docs("Training"))

In [7]:
# Parse the "Test" document set with the project-local parser.
# The result is later iterated by transform_data, which reads each item's
# "data_frame" and "entity_position" entries.
test_raw = internal_parser.extract_data(internal_parser.get_docs("Test"))

In [8]:
# Load the pre-trained 'bert-base-uncased' checkpoint; it is passed to
# transform_data as `pretrain_model` to produce the per-token hidden states.
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [12]:
def transform_data(
    raw_data, 
    pretrain_model, 
    ignore_index=CrossEntropyLoss().ignore_index,
    max_token_count=512,
    cls_token=internal_parser.CLS_TOKEN,
    sep_token=internal_parser.SEP_TOKEN
):
    """Encode the parsed dataset with a pre-trained model and build its labels.

    Labelling scheme:
      * Only the first token of each word keeps its label; the remaining
        tokens of the same word are masked with ``ignore_index``.
      * The label of O is 0.
      * The label of I is the negation of the corresponding label of B.

    Parameters
    ----------
    raw_data : sized iterable of dict
        One entry per document. Each entry must provide ``"data_frame"`` (with
        the columns ``token_ids``, ``begins``, ``ends``, ``entity_embedding``,
        ``words`` and ``sentence_embedding``) and ``"entity_position"``, a
        mapping from entity to a ``(begin, end)`` pair that is used here as a
        half-open range of row indices into the data frame.
    pretrain_model : callable
        Called with ``input_ids``, ``token_type_ids`` and ``attention_mask``;
        its result must expose ``last_hidden_state``. The model's train/eval
        mode is not changed here. NOTE(review): Hugging Face ``from_pretrained``
        returns models in eval mode; a model in train mode would apply dropout.
    ignore_index : int
        Label assigned to tokens that must not contribute to training/evaluation.
    max_token_count : int
        Maximum length of one model input, including the CLS/SEP tokens.
    cls_token, sep_token : int or falsy
        Token ids put before / after every segment; a falsy value disables
        the corresponding token.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The hidden state of every token (one row per token, CLS/SEP included)
        and a frame whose column 0 is the label and column 1 the word.

    Raises
    ------
    ValueError
        If ``max_token_count`` leaves no room for any real token.
    """
    padding_token_count = (1 if cls_token else 0) + (1 if sep_token else 0)
    segment_capacity = max_token_count - padding_token_count
    if segment_capacity <= 0:
        # With no room for real tokens the segmentation loop below would
        # never advance, so fail loudly instead of hanging.
        raise ValueError(
            "max_token_count must be greater than the number of special tokens "
            "(%d), got %d" % (padding_token_count, max_token_count)
        )

    progress = IntProgress(min=0, max=len(raw_data)) # instantiate the bar
    display(progress) # display the bar

    transformed_tokens = []
    true_labels = []
    true_words = []

    # The model is only used as a frozen feature extractor, so disable autograd:
    # the outputs are identical but no computation graph is built or retained.
    with torch.no_grad():
        for document in raw_data:
            progress.value += 1
            frame = document["data_frame"]
            tokens = frame["token_ids"].tolist()
            begins = frame["begins"].tolist()
            ends = frame["ends"].tolist()
            labels = frame["entity_embedding"].tolist()
            words = frame["words"].tolist()
            sentence_embedding = frame["sentence_embedding"].tolist()

            for i in range(1, len(tokens)):
                if begins[i] == begins[i-1] and ends[i] == ends[i-1]:
                    # Extra tokens from the same word are ignored
                    labels[i] = ignore_index

            for entity in document["entity_position"]:
                begin, end = document["entity_position"][entity]
                for i in range(begin + 1, end):
                    # Every subsequent word of an entity is labelled as I instead of B.
                    # -abs() rather than plain negation so that a token covered by
                    # several overlapping entities is not flipped back to B.
                    if labels[i] != ignore_index:
                        labels[i] = -abs(labels[i])

            i = 0
            while i < len(tokens):
                # Grow the segment while it stays inside one sentence and
                # still fits into the model together with the special tokens.
                j = i
                while (
                    j < len(tokens)
                    and sentence_embedding[i] == sentence_embedding[j]
                    and j - i < segment_capacity
                ):
                    j += 1
                # Segment the document and encode with the pre-trained model
                inputs = tokens[i:j]
                tmp_labels = labels[i:j]
                tmp_words = words[i:j]
                if cls_token: 
                    inputs = [cls_token] + inputs
                    tmp_labels = [ignore_index] + tmp_labels
                    tmp_words = ["[CLS]"] + tmp_words
                if sep_token:
                    inputs.append(sep_token)
                    tmp_labels.append(ignore_index)
                    tmp_words.append("[SEP]")
                outputs = pretrain_model(
                    input_ids=torch.tensor([inputs]), 
                    token_type_ids=torch.tensor([[0] * len(inputs)]),
                    attention_mask=torch.tensor([[1] * len(inputs)])
                )
                # Batch size is 1, so [0] selects the (tokens x hidden) matrix.
                transformed_tokens += outputs.last_hidden_state[0].tolist()
                true_labels += tmp_labels
                true_words += tmp_words
                i = j

    assert len(transformed_tokens) == len(true_labels) == len(true_words)
    return pd.DataFrame(transformed_tokens), pd.DataFrame(list(zip(true_labels, true_words)))

In [None]:
# Encode the training documents with BERT and persist the result as CSV in the
# current working directory (index=False: the row index is not written).
# NOTE(review): a later cell reloads these files from the "128_by_128"
# sub-directory, not from the paths written here - confirm the files are
# moved/copied there before that cell is run.
training_tokens, training_labels = transform_data(training_raw, bert_model)
print("Saving training tokens of shape", training_tokens.shape)
training_tokens.to_csv("training_tokens.csv", index=False)
print("Saving training labels of shape", training_labels.shape)
training_labels.to_csv("training_labels.csv", index=False)

IntProgress(value=0, max=288)

In [None]:
# Encode the test documents with BERT and persist the result as CSV in the
# current working directory (index=False: the row index is not written).
# NOTE(review): a later cell reloads these files from the "128_by_128"
# sub-directory, not from the paths written here - confirm the files are
# moved/copied there before that cell is run.
test_tokens, test_labels = transform_data(test_raw, bert_model)
print("Saving test tokens of shape", test_tokens.shape)
test_tokens.to_csv("test_tokens.csv", index=False)
print("Saving test labels of shape", test_labels.shape)
test_labels.to_csv("test_labels.csv", index=False)

In [None]:
# Reload the cached features/labels from the "128_by_128" directory.
# Forward slashes are used because they are accepted on every platform,
# whereas a hard-coded backslash only resolves on Windows.
training_tokens = pd.read_csv("128_by_128/training_tokens.csv")
training_labels = pd.read_csv("128_by_128/training_labels.csv")
test_tokens = pd.read_csv("128_by_128/test_tokens.csv")
test_labels = pd.read_csv("128_by_128/test_labels.csv")

In [None]:
# Sanity check: shapes of the four frames as loaded from disk.
print(*(frame.shape for frame in (training_tokens, training_labels, test_tokens, test_labels)))

In [None]:
# Drop every row whose label (column "0") is -100, i.e. the value
# transform_data uses by default to mask tokens that carry no label.
# The tokens are filtered first, while the labels are still unfiltered.
training_tokens = training_tokens[training_labels["0"].ne(-100)]
training_labels = training_labels[training_labels["0"].ne(-100)]

In [None]:
# Drop every row whose label (column "0") is -100, i.e. the value
# transform_data uses by default to mask tokens that carry no label.
# The tokens are filtered first, while the labels are still unfiltered.
test_tokens = test_tokens[test_labels["0"].ne(-100)]
test_labels = test_labels[test_labels["0"].ne(-100)]

In [None]:
# Sanity check: shapes of the four frames once the masked rows are removed.
print(*(frame.shape for frame in (training_tokens, training_labels, test_tokens, test_labels)))

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [None]:
from sklearn.metrics import precision_recall_fscore_support

In [None]:
label_map = {v: k for k, v in internal_parser.entity_encode.items()}

In [None]:
def run_classifier(clf, x_train, y_train, x_test, y_test):
    """Fit ``clf`` on the training split and print per-class test metrics.

    Both label vectors go through ``.abs()`` first, so a negative label is
    treated the same as its positive counterpart.

    Returns the fitted classifier together with a DataFrame holding
    precision, recall, F-score and support, one row per class in
    ``clf.classes_`` (row names are looked up in the module-level
    ``label_map``).
    """
    train_targets = y_train.abs()
    test_targets = y_test.abs()

    print("Fitting...")
    clf.fit(x_train, train_targets)
    print("Predicting...")
    predictions = clf.predict(x_test)

    print("Results:")
    # average=None yields one value per entry of `labels` for each metric.
    metrics = precision_recall_fscore_support(
        test_targets, predictions, average=None, labels=clf.classes_
    )
    column_names = ("precision", "recall", "fbeta_score", "support")
    result = pd.DataFrame(
        dict(zip(column_names, metrics)),
        index=[label_map[class_id] for class_id in clf.classes_],
    )
    print(result)

    return clf, result

In [None]:
# kneighbor_clf, kneighbor_result = run_classifier(KNeighborsClassifier(), training_tokens, training_labels["0"], test_tokens, test_labels["0"])
# Results:
#                      precision    recall  fbeta_score  support
# None                  0.971389  0.971116     0.971253    46289
# EnvironmentalIssues   0.822701  0.865340     0.843482     2161
# Date                  0.970588  0.958838     0.964677      413
# Organisation          0.723154  0.901674     0.802607     1434
# CommitmentLevel       0.586402  0.407480     0.480836     1016
# Location              0.797665  0.615616     0.694915      333
# CoalActivity          0.916667  0.846154     0.880000       26
# SocialIssues          0.877734  0.860836     0.869203     1818
# SocialOfficialTexts   0.788360  0.696262     0.739454      214

In [None]:
# decision_tree_clf, decision_tree_result = run_classifier(DecisionTreeClassifier(), training_tokens, training_labels["0"], test_tokens, test_labels["0"])
# Results:
#                      precision    recall  fbeta_score  support
# None                  0.929607  0.914645     0.922065    46289
# EnvironmentalIssues   0.528506  0.587691     0.556529     2161
# Date                  0.609572  0.585956     0.597531      413
# Organisation          0.362570  0.495816     0.418851     1434
# CommitmentLevel       0.140426  0.162402     0.150616     1016
# Location              0.237458  0.213213     0.224684      333
# CoalActivity          0.129032  0.153846     0.140351       26
# SocialIssues          0.553613  0.522552     0.537634     1818
# SocialOfficialTexts   0.252809  0.210280     0.229592      214

In [None]:
# random_forest_clf, random_forest_result = run_classifier(RandomForestClassifier(n_estimators=10, max_depth=10, verbose=1), training_tokens, training_labels["0"], test_tokens, test_labels["0"])

In [None]:
# nn_clf, nn_result = run_classifier(MLPClassifier((512,), verbose=True), training_tokens, training_labels["0"], test_tokens, test_labels["0"])

In [None]:
# The BIO embedding is used for further relation extraction
#
# label_map_bio = {}
# for key in internal_parser.entity_encode:
#     if internal_parser.entity_encode[key] == 0:
#         label_map_bio[0] = "O"
#     else:
#         label_map_bio[internal_parser.entity_encode[key]] = "B-" + key
#         label_map_bio[-internal_parser.entity_encode[key]] = "I-" + key

In [None]:
# def run_classifier_bio(clf, x_train, y_train, x_test, y_test):
#     print("Fitting...")
#     clf.fit(x_train, y_train)
#     print("Predicting...")
#     y_pred = clf.predict(x_test)
    
#     print("Results:")
#     precision, recall, fbeta_score, support = precision_recall_fscore_support(y_test, y_pred, average=None, labels=clf.classes_)
#     result = pd.DataFrame(index=[label_map_bio[label] for label in clf.classes_])
#     result["precision"] = precision
#     result["recall"] = recall
#     result["fbeta_score"] = fbeta_score
#     result["support"] = support
#     print(result)
    
#     return clf, result

In [None]:
# kneighbor_clf, kneighbor_result = run_classifier_bio(KNeighborsClassifier(), training_tokens, training_labels["0"], test_tokens, test_labels["0"])

In [None]:
# decision_tree_clf, decision_tree_result = run_classifier_bio(DecisionTreeClassifier(), training_tokens, training_labels["0"], test_tokens, test_labels["0"])

In [None]:
# random_forest_clf, random_forest_result = run_classifier_bio(RandomForestClassifier(verbose=1), training_tokens, training_labels["0"], test_tokens, test_labels["0"])

In [None]:
# nn_clf, nn_result = run_classifier_bio(MLPClassifier((512,), verbose=True), training_tokens, training_labels["0"], test_tokens, test_labels["0"])