In [None]:
import numpy as np
import pandas as pd
from transformers import BertModel
import torch
from torch.nn import CrossEntropyLoss

In [None]:
import sys
# Make the sibling "parser" directory importable. A forward slash is used
# because it is accepted on Windows as well as POSIX, whereas a backslash is an
# ordinary filename character on POSIX and would name a non-existent directory.
# The entry is relative, so it resolves against the notebook's working directory.
sys.path.append("../parser")
import internal_parser

In [None]:
from ipywidgets import IntProgress
from IPython.display import display

In [None]:
# training_raw = internal_parser.extract_data(internal_parser.get_docs("Training"))

In [None]:
# test_raw = internal_parser.extract_data(internal_parser.get_docs("Test"))

In [None]:
# bert_model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
def transform_data(
    raw_data, 
    pretrain_model, 
    ignore_index=CrossEntropyLoss().ignore_index,
    max_token_count=512,
    cls_token=internal_parser.CLS_TOKEN,
    sep_token=internal_parser.SEP_TOKEN
):
    """Transform the parsed dataset with a pre-trained model, segment by segment.

    Labelling scheme, applied to every document before it is encoded:
      * Only the first token of each word is labeled; the following tokens that
        share the same (begins, ends) span are masked as ``ignore_index``.
      * The label of O is 0.
      * The label of I is the negation of the corresponding label of B.

    Each document is then cut into segments. A segment never spans a change of
    ``sentence_embedding`` value and, together with the optional CLS/SEP ids
    wrapped around it, never exceeds ``max_token_count`` ids. Every segment is
    passed through ``pretrain_model`` on its own.

    Args:
        raw_data: Sized iterable of documents. Each document is indexed with
            ``"data_frame"`` (its columns ``token_ids``, ``begins``, ``ends``,
            ``entity_embedding``, ``words`` and ``sentence_embedding`` are read)
            and ``"entity_position"`` (iterated, each entry unpacked into
            ``(begin, end)`` token indices).
        pretrain_model: Callable accepting ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` keyword tensors and returning an object with a
            ``last_hidden_state`` attribute (presumably a transformers
            ``BertModel`` - confirm against the caller). It is not switched to
            eval mode here; the caller is responsible for that.
        ignore_index: Label written on positions that must not be scored
            (extra sub-word tokens and the CLS/SEP positions).
        max_token_count: Upper bound on the ids per model call, special ids
            included.
        cls_token: Id prepended to every segment; skipped when falsy.
        sep_token: Id appended to every segment; skipped when falsy.

    Returns:
        A pair of DataFrames with one row per encoded position (CLS/SEP rows
        included): the ``last_hidden_state`` vectors, and a two-column frame
        holding the label (column 0) and the word (column 1).

    Raises:
        ValueError: If a segment would be empty, which previously made the
            segmentation loop spin forever. This happens when
            ``max_token_count`` leaves no room for real tokens or when a
            ``sentence_embedding`` value does not compare equal to itself (NaN).
    """
    progress = IntProgress(min=0, max=len(raw_data))  # one tick per document
    display(progress)

    padding_token_count = (1 if cls_token else 0) + (1 if sep_token else 0)
    # Number of real tokens that fit in one segment once CLS/SEP are accounted for.
    segment_capacity = max_token_count - padding_token_count

    transformed_tokens = []
    true_labels = []
    true_words = []

    for document in raw_data:
        progress.value += 1
        frame = document["data_frame"]
        tokens = frame["token_ids"].tolist()
        begins = frame["begins"].tolist()
        ends = frame["ends"].tolist()
        labels = frame["entity_embedding"].tolist()
        words = frame["words"].tolist()
        sentence_embedding = frame["sentence_embedding"].tolist()

        for i in range(1, len(tokens)):
            if begins[i] == begins[i - 1] and ends[i] == ends[i - 1]:
                # Extra tokens from the same word are ignored
                labels[i] = ignore_index

        for entity in document["entity_position"]:
            begin, end = document["entity_position"][entity]
            for i in range(begin + 1, end):
                # Every subsequent word of an entity is labeled as I instead of B
                if labels[i] != ignore_index:
                    labels[i] = -labels[i]

        i = 0
        while i < len(tokens):
            # Grow [i, j) while the sentence id is unchanged and there is room left.
            j = i
            while (
                j < len(tokens)
                and sentence_embedding[i] == sentence_embedding[j]
                and j - i < segment_capacity
            ):
                j += 1
            if j == i:
                # No progress is possible; without this guard the loop never ends.
                raise ValueError(
                    f"Empty segment at token {i}: max_token_count={max_token_count} "
                    f"leaves room for {segment_capacity} token(s), or the sentence id "
                    f"{sentence_embedding[i]!r} does not compare equal to itself"
                )

            # Segment the document and encode with the pre-trained model
            inputs = tokens[i:j]
            tmp_labels = labels[i:j]
            tmp_words = words[i:j]
            if cls_token:
                inputs = [cls_token] + inputs
                tmp_labels = [ignore_index] + tmp_labels
                tmp_words = ["[CLS]"] + tmp_words
            if sep_token:
                inputs.append(sep_token)
                tmp_labels.append(ignore_index)
                tmp_words.append("[SEP]")

            # Pure feature extraction: skip autograd bookkeeping so no graph is
            # kept alive for each segment.
            with torch.no_grad():
                outputs = pretrain_model(
                    input_ids=torch.tensor([inputs]),
                    token_type_ids=torch.tensor([[0] * len(inputs)]),
                    attention_mask=torch.tensor([[1] * len(inputs)])
                )
            transformed_tokens += outputs.last_hidden_state[0].tolist()
            true_labels += tmp_labels
            true_words += tmp_words
            i = j

    assert len(transformed_tokens) == len(true_labels) == len(true_words)
    return pd.DataFrame(transformed_tokens), pd.DataFrame(list(zip(true_labels, true_words)))

In [None]:
# training_tokens, training_labels = transform_data(training_raw, bert_model)
# print("Saving training tokens of shape", training_tokens.shape)
# training_tokens.to_csv("training_tokens.csv", index=False)
# print("Saving training labels of shape", training_labels.shape)
# training_labels.to_csv("training_labels.csv", index=False)

In [None]:
# test_tokens, test_labels = transform_data(test_raw, bert_model)
# print("Saving test tokens of shape", test_tokens.shape)
# test_tokens.to_csv("test_tokens.csv", index=False)
# print("Saving test labels of shape", test_labels.shape)
# test_labels.to_csv("test_labels.csv", index=False)

In [None]:
# Load the cached token encodings and labels from the "sentence_by_sentence"
# directory. Forward slashes keep the relative paths valid on POSIX as well as
# Windows; a backslash would be treated as part of the file name on POSIX.
training_tokens = pd.read_csv("sentence_by_sentence/training_tokens.csv")
training_labels = pd.read_csv("sentence_by_sentence/training_labels.csv")
test_tokens = pd.read_csv("sentence_by_sentence/test_tokens.csv")
test_labels = pd.read_csv("sentence_by_sentence/test_labels.csv")

In [None]:
# Shapes of the four frames as loaded, before masked rows are dropped.
print(*[frame.shape for frame in (training_tokens, training_labels, test_tokens, test_labels)])

In [None]:
# Keep only training rows whose label (column "0") is not -100. Both selections
# are evaluated against the unfiltered labels before either name is rebound.
# NOTE(review): -100 is presumably the ignore_index written by transform_data - confirm.
training_tokens, training_labels = (
    training_tokens[training_labels["0"] != -100],
    training_labels[training_labels["0"] != -100],
)

In [None]:
# Keep only test rows whose label (column "0") is not -100. Both selections are
# evaluated against the unfiltered labels before either name is rebound.
# NOTE(review): -100 is presumably the ignore_index written by transform_data - confirm.
test_tokens, test_labels = (
    test_tokens[test_labels["0"] != -100],
    test_labels[test_labels["0"] != -100],
)

In [None]:
# # Test on both train and test datasets
# test_tokens = pd.concat([training_tokens, test_tokens], ignore_index=True)
# test_labels = pd.concat([training_labels, test_labels], ignore_index=True)

In [None]:
# Shapes of the four frames at this point in the notebook.
print(*[frame.shape for frame in (training_tokens, training_labels, test_tokens, test_labels)])

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [None]:
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# label_map = {v: k for k, v in internal_parser.entity_encode.items()}

In [None]:
# def run_classifier(clf, x_train, y_train, x_test, y_test):
#     y_train = y_train.abs()
#     y_test = y_test.abs()
    
#     print("Fitting...")
#     clf.fit(x_train, y_train)
#     print("Predicting...")
#     y_pred = clf.predict(x_test)
    
#     print("Results:")
#     precision, recall, fbeta_score, support = precision_recall_fscore_support(y_test, y_pred, average=None, labels=clf.classes_)
#     result = pd.DataFrame(index=[label_map[label] for label in clf.classes_])
#     result["precision"] = precision
#     result["recall"] = recall
#     result["fbeta_score"] = fbeta_score
#     result["support"] = support
#     print(result)
    
#     return clf, result

In [None]:
# kneighbor_clf, kneighbor_result = run_classifier(
#     KNeighborsClassifier(), 
#     training_tokens, training_labels["0"], test_tokens, test_labels["0"]
# )

r = """
128 by 128 results:
                     precision    recall  fbeta_score  support
None                  0.971389  0.971116     0.971253    46289
EnvironmentalIssues   0.822701  0.865340     0.843482     2161
Date                  0.970588  0.958838     0.964677      413
Organisation          0.723154  0.901674     0.802607     1434
CommitmentLevel       0.586402  0.407480     0.480836     1016
Location              0.797665  0.615616     0.694915      333
CoalActivity          0.916667  0.846154     0.880000       26
SocialIssues          0.877734  0.860836     0.869203     1818
SocialOfficialTexts   0.788360  0.696262     0.739454      214

Sentence by sentence results:
                     precision    recall  fbeta_score  support
None                  0.970356  0.971635     0.970995    46289
EnvironmentalIssues   0.811168  0.880611     0.844464     2161
Date                  0.961071  0.956416     0.958738      413
Organisation          0.775295  0.870990     0.820361     1434
CommitmentLevel       0.574176  0.411417     0.479358     1016
Location              0.790875  0.624625     0.697987      333
CoalActivity          0.863636  0.730769     0.791667       26
SocialIssues          0.881087  0.855886     0.868304     1818
SocialOfficialTexts   0.714976  0.691589     0.703088      214

128 by 128, train+test results:
                     precision    recall  fbeta_score  support
None                  0.978632  0.982483     0.980554   187072
EnvironmentalIssues   0.903884  0.906873     0.905376     9675
Date                  0.984105  0.982726     0.983415     2142
Organisation          0.914662  0.963587     0.938487    12825
CommitmentLevel       0.736190  0.519101     0.608874     4005
Location              0.891799  0.841900     0.866131     1537
CoalActivity          0.971831  0.663462     0.788571      104
SocialIssues          0.919010  0.902369     0.910614     6627
SocialOfficialTexts   0.839286  0.799073     0.818686      647

Sentence by sentence, train+test results:
                     precision    recall  fbeta_score  support
None                  0.978382  0.982926     0.980649   187072
EnvironmentalIssues   0.899106  0.914625     0.906799     9675
Date                  0.981716  0.977591     0.979649     2142
Organisation          0.927483  0.958363     0.942670    12825
CommitmentLevel       0.725000  0.514107     0.601607     4005
Location              0.887130  0.838647     0.862207     1537
CoalActivity          0.941176  0.615385     0.744186      104
SocialIssues          0.923255  0.902218     0.912615     6627
SocialOfficialTexts   0.823529  0.800618     0.811912      647
"""

In [None]:
# decision_tree_clf, decision_tree_result = run_classifier(
#     DecisionTreeClassifier(), 
#     training_tokens, training_labels["0"], test_tokens, test_labels["0"]
# )

r = """
128 by 128 results:
                     precision    recall  fbeta_score  support
None                  0.929607  0.914645     0.922065    46289
EnvironmentalIssues   0.528506  0.587691     0.556529     2161
Date                  0.609572  0.585956     0.597531      413
Organisation          0.362570  0.495816     0.418851     1434
CommitmentLevel       0.140426  0.162402     0.150616     1016
Location              0.237458  0.213213     0.224684      333
CoalActivity          0.129032  0.153846     0.140351       26
SocialIssues          0.553613  0.522552     0.537634     1818
SocialOfficialTexts   0.252809  0.210280     0.229592      214

Sentence by sentence results:
                     precision    recall  fbeta_score  support
None                  0.930128  0.912204     0.921078    46289
EnvironmentalIssues   0.518117  0.575659     0.545375     2161
Date                  0.537778  0.585956     0.560834      413
Organisation          0.350398  0.491632     0.409170     1434
CommitmentLevel       0.131424  0.153543     0.141625     1016
Location              0.210909  0.174174     0.190789      333
CoalActivity          0.156250  0.192308     0.172414       26
SocialIssues          0.548997  0.526953     0.537749     1818
SocialOfficialTexts   0.229268  0.219626     0.224344      214

128 by 128, train+test results:
                     precision    recall  fbeta_score  support
None                  0.982745  0.979425     0.981082   187072
EnvironmentalIssues   0.890285  0.908320     0.899212     9675
Date                  0.928807  0.919701     0.924232     2142
Organisation          0.907019  0.943158     0.924735    12825
CommitmentLevel       0.766480  0.786767     0.776491     4005
Location              0.840714  0.827586     0.834098     1537
CoalActivity          0.689655  0.769231     0.727273      104
SocialIssues          0.885301  0.868870     0.877009     6627
SocialOfficialTexts   0.782958  0.752705     0.767533      647
"""

In [None]:
# random_forest_clf, random_forest_result = run_classifier(
#     RandomForestClassifier(n_estimators=20, max_depth=10, verbose=1), 
#     training_tokens, training_labels["0"], test_tokens, test_labels["0"]
# )

r = """
128 by 128 results:
                     precision    recall  fbeta_score  support
None                  0.891979  0.998985     0.942454    46289
EnvironmentalIssues   0.973034  0.400740     0.567683     2161
Date                  0.975309  0.382567     0.549565      413
Organisation          0.944118  0.223849     0.361894     1434
CommitmentLevel       0.000000  0.000000     0.000000     1016
Location              0.000000  0.000000     0.000000      333
CoalActivity          0.000000  0.000000     0.000000       26
SocialIssues          0.978723  0.253025     0.402098     1818
SocialOfficialTexts   0.000000  0.000000     0.000000      214

Sentence by sentence results:
                     precision    recall  fbeta_score  support
None                  0.889877  0.998898     0.941241    46289
EnvironmentalIssues   0.973269  0.370662     0.536863     2161
Date                  0.988166  0.404358     0.573883      413
Organisation          0.916914  0.215481     0.348955     1434
CommitmentLevel       0.000000  0.000000     0.000000     1016
Location              1.000000  0.009009     0.017857      333
CoalActivity          0.000000  0.000000     0.000000       26
SocialIssues          0.963504  0.217822     0.355316     1818
SocialOfficialTexts   1.000000  0.004673     0.009302      214

128 by 128, train+test results:
                     precision    recall  fbeta_score  support
None                  0.890060  0.999305     0.941524   187072
EnvironmentalIssues   0.993397  0.450956     0.620317     9675
Date                  0.998213  0.521475     0.685066     2142
Organisation          0.984903  0.534113     0.692619    12825
CommitmentLevel       0.000000  0.000000     0.000000     4005
Location              1.000000  0.027326     0.053198     1537
CoalActivity          0.000000  0.000000     0.000000      104
SocialIssues          0.978947  0.308737     0.469428     6627
SocialOfficialTexts   1.000000  0.004637     0.009231      647
"""

In [None]:
# random_forest_clf, random_forest_result = run_classifier(
#     RandomForestClassifier(n_estimators=20, max_depth=30, verbose=1), 
#     training_tokens, training_labels["0"], test_tokens, test_labels["0"]
# )

r = """
128 by 128 results:
                     precision    recall  fbeta_score  support
None                  0.915483  0.995939     0.954018    46289
EnvironmentalIssues   0.945131  0.565942     0.707959     2161
Date                  0.965116  0.602906     0.742176      413
Organisation          0.851613  0.460251     0.597555     1434
CommitmentLevel       0.958333  0.022638     0.044231     1016
Location              0.968750  0.093093     0.169863      333
CoalActivity          0.000000  0.000000     0.000000       26
SocialIssues          0.952077  0.491749     0.648531     1818
SocialOfficialTexts   0.880000  0.102804     0.184100      214

Sentence by sentence results:
                     precision    recall  fbeta_score  support
None                  0.912848  0.996306     0.952753    46289
EnvironmentalIssues   0.954211  0.540028     0.689716     2161
Date                  0.980916  0.622276     0.761481      413
Organisation          0.818919  0.422594     0.557498     1434
CommitmentLevel       0.842105  0.015748     0.030918     1016
Location              1.000000  0.099099     0.180328      333
CoalActivity          1.000000  0.076923     0.142857       26
SocialIssues          0.964571  0.464246     0.626810     1818
SocialOfficialTexts   0.896552  0.121495     0.213992      214

128 by 128, train+test results:
                     precision    recall  fbeta_score  support
None                  0.965503  0.999097     0.982012   187072
EnvironmentalIssues   0.993411  0.857054     0.920209     9675
Date                  0.996397  0.903828     0.947858     2142
Organisation          0.991076  0.926550     0.957727    12825
CommitmentLevel       0.999418  0.428464     0.599790     4005
Location              1.000000  0.781392     0.877283     1537
CoalActivity          1.000000  0.682692     0.811429      104
SocialIssues          0.992024  0.807002     0.889998     6627
SocialOfficialTexts   0.992366  0.602782     0.750000      647
"""

In [None]:
# random_forest_clf, random_forest_result = run_classifier(
#     RandomForestClassifier(n_estimators=20, max_depth=100, verbose=1), 
#     training_tokens, training_labels["0"], test_tokens, test_labels["0"]
# )

r = """
128 by 128 results:
                     precision    recall  fbeta_score  support
None                  0.914231  0.996868     0.953763    46289
EnvironmentalIssues   0.959016  0.541416     0.692103     2161
Date                  0.984556  0.617433     0.758929      413
Organisation          0.879720  0.438633     0.585389     1434
CommitmentLevel       0.970588  0.032480     0.062857     1016
Location              0.914286  0.096096     0.173913      333
CoalActivity          0.000000  0.000000     0.000000       26
SocialIssues          0.955414  0.495050     0.652174     1818
SocialOfficialTexts   0.923077  0.112150     0.200000      214

Sentence by sentence results:
                     precision    recall  fbeta_score  support
None                  0.910546  0.996587     0.951626    46289
EnvironmentalIssues   0.950503  0.524294     0.675813     2161
Date                  0.968254  0.590799     0.733835      413
Organisation          0.839150  0.385635     0.528428     1434
CommitmentLevel       0.920000  0.022638     0.044188     1016
Location              0.942857  0.099099     0.179348      333
CoalActivity          1.000000  0.038462     0.074074       26
SocialIssues          0.958872  0.448845     0.611465     1818
SocialOfficialTexts   0.923077  0.112150     0.200000      214

128 by 128, train+test results:
                     precision    recall  fbeta_score  support
None                  0.976296  0.999123     0.987578   187072
EnvironmentalIssues   0.994241  0.892196     0.940459     9675
Date                  0.998484  0.922502     0.958991     2142
Organisation          0.991063  0.942456     0.966148    12825
CommitmentLevel       0.998622  0.723596     0.839149     4005
Location              1.000000  0.802212     0.890253     1537
CoalActivity          1.000000  0.740385     0.850829      104
SocialIssues          0.992764  0.848800     0.915155     6627
SocialOfficialTexts   0.991170  0.693972     0.816364      647
"""

In [None]:
# random_forest_clf, random_forest_result = run_classifier(
#     RandomForestClassifier(n_estimators=20, verbose=1), 
#     training_tokens, training_labels["0"], test_tokens, test_labels["0"]
# )

r = """
128 by 128 results:
                     precision    recall  fbeta_score  support
None                  0.913472  0.996198     0.953043    46289
EnvironmentalIssues   0.951259  0.541879     0.690448     2161
Date                  0.960938  0.595642     0.735426      413
Organisation          0.875335  0.455370     0.599083     1434
CommitmentLevel       0.945946  0.034449     0.066477     1016
Location              1.000000  0.084084     0.155125      333
CoalActivity          0.000000  0.000000     0.000000       26
SocialIssues          0.949115  0.471947     0.630419     1818
SocialOfficialTexts   0.952381  0.093458     0.170213      214

Sentence by sentence results:
                     precision    recall  fbeta_score  support
None                  0.910945  0.996630     0.951863    46289
EnvironmentalIssues   0.952579  0.529847     0.680940     2161
Date                  0.984314  0.607748     0.751497      413
Organisation          0.843137  0.389819     0.533143     1434
CommitmentLevel       1.000000  0.020669     0.040501     1016
Location              0.968750  0.093093     0.169863      333
CoalActivity          0.000000  0.000000     0.000000       26
SocialIssues          0.962353  0.449945     0.613193     1818
SocialOfficialTexts   0.947368  0.168224     0.285714      214

128 by 128, train+test results:
                     precision    recall  fbeta_score  support
None                  0.975577  0.999097     0.987197   187072
EnvironmentalIssues   0.992616  0.889199     0.938066     9675
Date                  0.997979  0.922035     0.958505     2142
Organisation          0.991807  0.934503     0.962303    12825
CommitmentLevel       0.998602  0.713358     0.832217     4005
Location              0.998375  0.799610     0.888006     1537
CoalActivity          1.000000  0.730769     0.844444      104
SocialIssues          0.993343  0.855591     0.919335     6627
SocialOfficialTexts   0.995516  0.686244     0.812443      647

Sentence by sentence, train+test results:
                     precision    recall  fbeta_score  support
None                  0.975022  0.999086     0.986907   187072
EnvironmentalIssues   0.993875  0.888889     0.938455     9675
Date                  0.995450  0.919234     0.955825     2142
Organisation          0.991126  0.931852     0.960575    12825
CommitmentLevel       0.999650  0.713109     0.832410     4005
Location              0.992771  0.804164     0.888569     1537
CoalActivity          1.000000  0.721154     0.837989      104
SocialIssues          0.992895  0.843519     0.912132     6627
SocialOfficialTexts   0.997773  0.692427     0.817518      647
"""

In [None]:
# random_forest_clf, random_forest_result = run_classifier(
#     RandomForestClassifier(n_estimators=20, class_weight="balanced", verbose=1), 
#     training_tokens, training_labels["0"], test_tokens, test_labels["0"]
# )

r = """
128 by 128 results:
                     precision    recall  fbeta_score  support
None                  0.904571  0.998293     0.949124    46289
EnvironmentalIssues   0.978452  0.462286     0.627907     2161
Date                  0.980769  0.617433     0.757801      413
Organisation          0.925754  0.278243     0.427882     1434
CommitmentLevel       0.842105  0.015748     0.030918     1016
Location              0.939394  0.093093     0.169399      333
CoalActivity          1.000000  0.230769     0.375000       26
SocialIssues          0.972396  0.426293     0.592734     1818
SocialOfficialTexts   0.884615  0.214953     0.345865      214

128 by 128, train+test results:
                     precision    recall  fbeta_score  support
None                  0.972730  0.999519     0.985942   187072
EnvironmentalIssues   0.996226  0.873075     0.930594     9675
Date                  0.997983  0.923903     0.959515     2142
Organisation          0.997026  0.914776     0.954131    12825
CommitmentLevel       0.999303  0.716355     0.834497     4005
Location              0.998367  0.795706     0.885590     1537
CoalActivity          1.000000  0.778846     0.875676      104
SocialIssues          0.996381  0.830994     0.906204     6627
SocialOfficialTexts   0.985325  0.726430     0.836299      647

Sentence by sentence, train+test results:
                     precision    recall  fbeta_score  support
None                  0.972124  0.999551     0.985647   187072
EnvironmentalIssues   0.996204  0.867907     0.927640     9675
Date                  0.997981  0.922969     0.959010     2142
Organisation          0.997016  0.911969     0.952598    12825
CommitmentLevel       0.998243  0.709114     0.829197     4005
Location              0.999183  0.795706     0.885911     1537
CoalActivity          1.000000  0.798077     0.887701      104
SocialIssues          0.997280  0.829787     0.905856     6627
SocialOfficialTexts   0.993711  0.732612     0.843416      647
"""

In [None]:
# random_forest_clf, random_forest_result = run_classifier(
#     RandomForestClassifier(n_estimators=20, class_weight="balanced_subsample", verbose=1), 
#     training_tokens, training_labels["0"], test_tokens, test_labels["0"]
# )

r = """
128 by 128 results:
                     precision    recall  fbeta_score  support
None                  0.904962  0.998315     0.949349    46289
EnvironmentalIssues   0.975822  0.466913     0.631612     2161
Date                  0.980392  0.605327     0.748503      413
Organisation          0.922222  0.289400     0.440552     1434
CommitmentLevel       0.952381  0.019685     0.038573     1016
Location              0.937500  0.090090     0.164384      333
CoalActivity          1.000000  0.192308     0.322581       26
SocialIssues          0.974555  0.421342     0.588326     1818
SocialOfficialTexts   0.929825  0.247664     0.391144      214

128 by 128, train+test result:
                     precision    recall  fbeta_score  support
None                  0.972943  0.999540     0.986062   187072
EnvironmentalIssues   0.997045  0.871731     0.930186     9675
Date                  0.996965  0.920168     0.957028     2142
Organisation          0.997033  0.916959     0.955321    12825
CommitmentLevel       0.998951  0.713358     0.832338     4005
Location              0.998371  0.797658     0.886799     1537
CoalActivity          1.000000  0.798077     0.887701      104
SocialIssues          0.996046  0.836276     0.909195     6627
SocialOfficialTexts   0.983402  0.732612     0.839681      647

Sentence by sentence, train+test results:
                     precision    recall  fbeta_score  support
None                  0.971904  0.999465     0.985492   187072
EnvironmentalIssues   0.995599  0.865220     0.925842     9675
Date                  0.995459  0.921102     0.956838     2142
Organisation          0.996419  0.911111     0.951857    12825
CommitmentLevel       0.998949  0.712110     0.831487     4005
Location              0.999187  0.799610     0.888327     1537
CoalActivity          1.000000  0.778846     0.875676      104
SocialIssues          0.996544  0.826618     0.903662     6627
SocialOfficialTexts   0.991614  0.731066     0.841637      647
"""

In [None]:
# nn_clf, nn_result = run_classifier(
#     MLPClassifier((64,), verbose=True), 
#     training_tokens, training_labels["0"], test_tokens, test_labels["0"]
# )

r = """
128 by 128 results:
                     precision    recall  fbeta_score  support
None                  0.961467  0.971894     0.966652    46289
EnvironmentalIssues   0.807397  0.828320     0.817725     2161
Date                  0.958637  0.953995     0.956311      413
Organisation          0.847050  0.760809     0.801616     1434
CommitmentLevel       0.446009  0.374016     0.406852     1016
Location              0.758741  0.651652     0.701131      333
CoalActivity          0.714286  0.961538     0.819672       26
SocialIssues          0.884005  0.796480     0.837963     1818
SocialOfficialTexts   0.833333  0.724299     0.775000      214

Sentence by sentence results:
                     precision    recall  fbeta_score  support
None                  0.960847  0.970209     0.965505    46289
EnvironmentalIssues   0.781803  0.838963     0.809375     2161
Date                  0.946262  0.980630     0.963139      413
Organisation          0.811195  0.677127     0.738122     1434
CommitmentLevel       0.437500  0.316929     0.367580     1016
Location              0.769006  0.789790     0.779259      333
CoalActivity          0.687500  0.846154     0.758621       26
SocialIssues          0.875291  0.826183     0.850028     1818
SocialOfficialTexts   0.783505  0.710280     0.745098      214

128 by 128, train+test result:
                     precision    recall  fbeta_score  support
None                  0.988364  0.992548     0.990452   187072
EnvironmentalIssues   0.958139  0.953385     0.955756     9675
Date                  0.994364  0.988329     0.991337     2142
Organisation          0.981952  0.958752     0.970213    12825
CommitmentLevel       0.867334  0.822722     0.844439     4005
Location              0.932484  0.952505     0.942388     1537
CoalActivity          0.959184  0.903846     0.930693      104
SocialIssues          0.960888  0.934209     0.947360     6627
SocialOfficialTexts   0.936426  0.842349     0.886900      647

Sentence by sentence, train+test results:
                     precision    recall  fbeta_score  support
None                  0.989443  0.992965     0.991201   187072
EnvironmentalIssues   0.949024  0.960207     0.954583     9675
Date                  0.978851  0.993931     0.986333     2142
Organisation          0.982204  0.963977     0.973005    12825
CommitmentLevel       0.893427  0.824719     0.857699     4005
Location              0.952824  0.932986     0.942801     1537
CoalActivity          0.943925  0.971154     0.957346      104
SocialIssues          0.971857  0.937981     0.954619     6627
SocialOfficialTexts   0.932813  0.922720     0.927739      647
"""

In [None]:
# nn_clf, nn_result = run_classifier(
#     MLPClassifier((512,), verbose=True), 
#     training_tokens, training_labels["0"], test_tokens, test_labels["0"]
# )

r = """
128 by 128 results:
                     precision    recall  fbeta_score  support
None                  0.960393  0.981162     0.970666    46289
EnvironmentalIssues   0.852666  0.843591     0.848104     2161
Date                  0.980440  0.970944     0.975669      413
Organisation          0.855167  0.732915     0.789335     1434
CommitmentLevel       0.568659  0.346457     0.430581     1016
Location              0.851986  0.708709     0.773770      333
CoalActivity          0.814815  0.846154     0.830189       26
SocialIssues          0.922476  0.778878     0.844617     1818
SocialOfficialTexts   0.844444  0.710280     0.771574      214

Sentence by sentence results:
                     precision    recall  fbeta_score  support
None                  0.963631  0.973644     0.968611    46289
EnvironmentalIssues   0.789941  0.864877     0.825712     2161
Date                  0.957647  0.985472     0.971360      413
Organisation          0.865100  0.693166     0.769648     1434
CommitmentLevel       0.512748  0.356299     0.420441     1016
Location              0.816609  0.708709     0.758842      333
CoalActivity          0.956522  0.846154     0.897959       26
SocialIssues          0.876076  0.839934     0.857624     1818
SocialOfficialTexts   0.708155  0.771028     0.738255      214

128 by 128, train+test result:
                     precision    recall  fbeta_score  support
None                  0.987384  0.995675     0.991512   187072
EnvironmentalIssues   0.966413  0.957623     0.961998     9675
Date                  0.993470  0.994398     0.993934     2142
Organisation          0.988165  0.957037     0.972352    12825
CommitmentLevel       0.933845  0.789513     0.855635     4005
Location              0.966420  0.936239     0.951091     1537
CoalActivity          0.970874  0.961538     0.966184      104
SocialIssues          0.984558  0.923646     0.953130     6627
SocialOfficialTexts   0.931677  0.927357     0.929512      647

Sentence by sentence, train+test results:
                     precision    recall  fbeta_score  support
None                  0.989774  0.993960     0.991863   187072
EnvironmentalIssues   0.963218  0.952765     0.957963     9675
Date                  0.994353  0.986461     0.990391     2142
Organisation          0.987123  0.968343     0.977643    12825
CommitmentLevel       0.895687  0.829713     0.861439     4005
Location              0.976616  0.923878     0.949515     1537
CoalActivity          0.926606  0.971154     0.948357      104
SocialIssues          0.963446  0.958503     0.960968     6627
SocialOfficialTexts   0.936407  0.910355     0.923197      647
"""

In [None]:
# nn_clf, nn_result = run_classifier(
#     MLPClassifier((1024,), verbose=True), 
#     training_tokens, training_labels["0"], test_tokens, test_labels["0"]
# )

r = """
128 by 128 results:
                     precision    recall  fbeta_score  support
None                  0.962570  0.978893     0.970663    46289
EnvironmentalIssues   0.822459  0.857473     0.839601     2161
Date                  0.987775  0.978208     0.982968      413
Organisation          0.863158  0.743375     0.798801     1434
CommitmentLevel       0.565287  0.349409     0.431873     1016
Location              0.839590  0.738739     0.785942      333
CoalActivity          0.676471  0.884615     0.766667       26
SocialIssues          0.906962  0.809681     0.855565     1818
SocialOfficialTexts   0.890323  0.644860     0.747967      214

Sentence by sentence results:
                     precision    recall  fbeta_score  support
None                  0.964718  0.975847     0.970251    46289
EnvironmentalIssues   0.827446  0.845442     0.836347     2161
Date                  0.961446  0.966102     0.963768      413
Organisation          0.842620  0.780335     0.810282     1434
CommitmentLevel       0.517568  0.376969     0.436219     1016
Location              0.861538  0.672673     0.755481      333
CoalActivity          0.766667  0.884615     0.821429       26
SocialIssues          0.880503  0.847085     0.863471     1818
SocialOfficialTexts   0.960265  0.677570     0.794521      214

128 by 128, train+test results:
                     precision    recall  fbeta_score  support
None                  0.990990  0.993035     0.992011   187072
EnvironmentalIssues   0.957303  0.964031     0.960655     9675
Date                  0.995331  0.995331     0.995331     2142
Organisation          0.984171  0.974425     0.979274    12825
CommitmentLevel       0.908468  0.827715     0.866214     4005
Location              0.953978  0.944047     0.948986     1537
CoalActivity          0.989796  0.932692     0.960396      104
SocialIssues          0.944354  0.962879     0.953527     6627
SocialOfficialTexts   0.945423  0.829985     0.883951      647

Sentence by sentence, train+test results:
                     precision    recall  fbeta_score  support
None                  0.989788  0.994799     0.992287   187072
EnvironmentalIssues   0.966187  0.956899     0.961520     9675
Date                  0.994408  0.996265     0.995336     2142
Organisation          0.985677  0.960468     0.972909    12825
CommitmentLevel       0.922949  0.834457     0.876475     4005
Location              0.957405  0.950553     0.953967     1537
CoalActivity          0.961538  0.961538     0.961538      104
SocialIssues          0.972698  0.951562     0.962014     6627
SocialOfficialTexts   0.920732  0.933539     0.927091      647
"""

In [None]:
# The BIO embedding is used for further relation extraction.
# Map each signed integer code to its tag name: 0 -> "O", a non-zero code -> "B-<name>",
# and its negation -> "I-<name>".

label_map_bio = {}
for key, code in internal_parser.entity_encode.items():
    if code == 0:
        label_map_bio[0] = "O"
        continue
    label_map_bio[code] = "B-" + key
    label_map_bio[-code] = "I-" + key

In [None]:
def run_classifier_bio(clf, x_train, y_train, x_test, y_test):
    """Fit ``clf``, predict on the test split and print per-class BIO metrics.

    Args:
        clf: Estimator exposing ``fit``, ``predict`` and, once fitted, ``classes_``.
        x_train: Training features, passed to ``clf.fit``.
        y_train: Training labels, passed to ``clf.fit``.
        x_test: Test features, passed to ``clf.predict``.
        y_test: Test labels the predictions are scored against.

    Returns:
        ``(clf, result)``: the fitted estimator and a DataFrame with one row per
        entry of ``clf.classes_`` (row names looked up in ``label_map_bio``) and
        the columns ``precision``, ``recall``, ``fbeta_score`` and ``support``.

    Raises:
        KeyError: If a value in ``clf.classes_`` has no entry in ``label_map_bio``.
    """
    print("Fitting...")
    clf.fit(x_train, y_train)

    print("Predicting...")
    predictions = clf.predict(x_test)

    print("Results:")
    # average=None keeps one score per class instead of a single aggregate.
    metric_names = ("precision", "recall", "fbeta_score", "support")
    metric_values = precision_recall_fscore_support(
        y_test, predictions, average=None, labels=clf.classes_
    )
    result = pd.DataFrame(
        dict(zip(metric_names, metric_values)),
        index=[label_map_bio[class_id] for class_id in clf.classes_],
    )
    print(result)

    return clf, result

In [None]:
# Baseline: k-nearest-neighbours with default settings, trained and scored on
# training_labels["0"] / test_labels["0"].
kneighbor_clf, kneighbor_result = run_classifier_bio(
    clf=KNeighborsClassifier(),
    x_train=training_tokens,
    y_train=training_labels["0"],
    x_test=test_tokens,
    y_test=test_labels["0"],
)

# Presumably the pasted output of an earlier run of this cell, kept for reference.
r = """
Results:
                       precision    recall  fbeta_score  support
I-SocialOfficialTexts   0.676471  0.672515     0.674487      171
I-SocialIssues          0.792706  0.673736     0.728395      613
I-CoalActivity          0.000000  0.000000     0.000000        4
I-Location              0.564103  0.415094     0.478261       53
I-CommitmentLevel       0.555556  0.424028     0.480962      283
I-Organisation          0.493590  0.696833     0.577861      221
I-Date                  0.854167  0.512500     0.640625       80
I-EnvironmentalIssues   0.643921  0.742489     0.689701      699
O                       0.969539  0.972283     0.970909    46289
B-EnvironmentalIssues   0.827720  0.874145     0.850299     1462
B-Date                  0.886740  0.963964     0.923741      333
B-Organisation          0.816165  0.849134     0.832323     1213
B-CommitmentLevel       0.571134  0.377899     0.454844      733
B-Location              0.762557  0.596429     0.669339      280
B-CoalActivity          0.772727  0.772727     0.772727       22
B-SocialIssues          0.822294  0.844813     0.833402     1205
B-SocialOfficialTexts   0.750000  0.697674     0.722892       43
"""

In [None]:
# Random forest with 20 trees and default class weights.
# random_state is pinned so the fit is repeatable on Restart & Run All and the
# scores can be compared against runs that use other settings.
random_forest_clf, random_forest_result = run_classifier_bio(
    clf=RandomForestClassifier(n_estimators=20, random_state=0, verbose=1),
    x_train=training_tokens,
    y_train=training_labels["0"],
    x_test=test_tokens,
    y_test=test_labels["0"],
)

# Presumably the pasted output of an earlier, unseeded run; a seeded re-run
# will not reproduce these exact numbers.
r = """
Results:
                       precision    recall  fbeta_score  support
I-SocialOfficialTexts   0.900000  0.157895     0.268657      171
I-SocialIssues          0.902174  0.270799     0.416562      613
I-CoalActivity          0.000000  0.000000     0.000000        4
I-Location              1.000000  0.037736     0.072727       53
I-CommitmentLevel       1.000000  0.038869     0.074830      283
I-Organisation          0.645161  0.180995     0.282686      221
I-Date                  1.000000  0.012500     0.024691       80
I-EnvironmentalIssues   0.857143  0.248927     0.385809      699
O                       0.906507  0.997688     0.949915    46289
B-EnvironmentalIssues   0.892936  0.553352     0.683277     1462
B-Date                  0.820833  0.591592     0.687609      333
B-Organisation          0.903491  0.362737     0.517647     1213
B-CommitmentLevel       0.800000  0.010914     0.021534      733
B-Location              0.931034  0.096429     0.174757      280
B-CoalActivity          0.000000  0.000000     0.000000       22
B-SocialIssues          0.876481  0.429876     0.576837     1205
B-SocialOfficialTexts   1.000000  0.069767     0.130435       43
"""

In [None]:
# Random forest with 20 trees and class_weight="balanced".
# random_state is pinned so the fit is repeatable on Restart & Run All and the
# effect of the class weighting is not masked by run-to-run sampling noise.
random_forest_clf, random_forest_result = run_classifier_bio(
    clf=RandomForestClassifier(
        n_estimators=20, class_weight="balanced", random_state=0, verbose=1
    ),
    x_train=training_tokens,
    y_train=training_labels["0"],
    x_test=test_tokens,
    y_test=test_labels["0"],
)
# Presumably the pasted output of an earlier, unseeded run; a seeded re-run
# will not reproduce these exact numbers.
r = """
Results:
                       precision    recall  fbeta_score  support
I-SocialOfficialTexts   0.870968  0.157895     0.267327      171
I-SocialIssues          0.928177  0.274062     0.423174      613
I-CoalActivity          0.000000  0.000000     0.000000        4
I-Location              1.000000  0.037736     0.072727       53
I-CommitmentLevel       1.000000  0.031802     0.061644      283
I-Organisation          0.857143  0.054299     0.102128      221
I-Date                  1.000000  0.025000     0.048780       80
I-EnvironmentalIssues   0.890909  0.210300     0.340278      699
O                       0.897756  0.998898     0.945630    46289
B-EnvironmentalIssues   0.920168  0.449384     0.603860     1462
B-Date                  0.831967  0.609610     0.703640      333
B-Organisation          0.935860  0.264633     0.412596     1213
B-CommitmentLevel       1.000000  0.004093     0.008152      733
B-Location              1.000000  0.103571     0.187702      280
B-CoalActivity          1.000000  0.227273     0.370370       22
B-SocialIssues          0.916854  0.338589     0.494545     1205
B-SocialOfficialTexts   0.923077  0.279070     0.428571       43
"""

In [None]:
# Random forest with 20 trees and class_weight="balanced_subsample".
# random_state is pinned so the fit is repeatable on Restart & Run All and the
# effect of the class weighting is not masked by run-to-run sampling noise.
random_forest_clf, random_forest_result = run_classifier_bio(
    clf=RandomForestClassifier(
        n_estimators=20, class_weight="balanced_subsample", random_state=0, verbose=1
    ),
    x_train=training_tokens,
    y_train=training_labels["0"],
    x_test=test_tokens,
    y_test=test_labels["0"],
)
# Presumably the pasted output of an earlier, unseeded run; a seeded re-run
# will not reproduce these exact numbers.
r = """
Results:
                       precision    recall  fbeta_score  support
I-SocialOfficialTexts   1.000000  0.152047     0.263959      171
I-SocialIssues          0.939560  0.278956     0.430189      613
I-CoalActivity          0.000000  0.000000     0.000000        4
I-Location              1.000000  0.056604     0.107143       53
I-CommitmentLevel       1.000000  0.017668     0.034722      283
I-Organisation          0.800000  0.072398     0.132780      221
I-Date                  1.000000  0.075000     0.139535       80
I-EnvironmentalIssues   0.900000  0.218884     0.352129      699
O                       0.897448  0.998963     0.945488    46289
B-EnvironmentalIssues   0.933237  0.439808     0.597861     1462
B-Date                  0.850427  0.597598     0.701940      333
B-Organisation          0.925697  0.246496     0.389323     1213
B-CommitmentLevel       0.714286  0.006821     0.013514      733
B-Location              1.000000  0.089286     0.163934      280
B-CoalActivity          1.000000  0.227273     0.370370       22
B-SocialIssues          0.934461  0.366805     0.526818     1205
B-SocialOfficialTexts   0.818182  0.209302     0.333333       43
"""

In [None]:
# Multi-layer perceptron with a single hidden layer of 512 units.
# random_state is pinned so the fit is repeatable on Restart & Run All and the
# scores can be compared against other hidden-layer sizes.
nn_clf, nn_result = run_classifier_bio(
    clf=MLPClassifier(hidden_layer_sizes=(512,), random_state=0, verbose=True),
    x_train=training_tokens,
    y_train=training_labels["0"],
    x_test=test_tokens,
    y_test=test_labels["0"],
)
# Presumably the pasted output of an earlier, unseeded run; a seeded re-run
# will not reproduce these exact numbers.
r = """
Results:
                       precision    recall  fbeta_score  support
I-SocialOfficialTexts   0.740541  0.801170     0.769663      171
I-SocialIssues          0.859275  0.657423     0.744917      613
I-CoalActivity          0.181818  0.500000     0.266667        4
I-Location              0.770833  0.698113     0.732673       53
I-CommitmentLevel       0.612903  0.335689     0.433790      283
I-Organisation          0.708520  0.714932     0.711712      221
I-Date                  0.841463  0.862500     0.851852       80
I-EnvironmentalIssues   0.658065  0.729614     0.691995      699
O                       0.962397  0.978656     0.970458    46289
B-EnvironmentalIssues   0.832656  0.840629     0.836624     1462
B-Date                  0.948485  0.939940     0.944193      333
B-Organisation          0.901316  0.677659     0.773647     1213
B-CommitmentLevel       0.516807  0.335607     0.406948      733
B-Location              0.850000  0.667857     0.748000      280
B-CoalActivity          0.857143  0.818182     0.837209       22
B-SocialIssues          0.844500  0.847303     0.845899     1205
B-SocialOfficialTexts   0.804878  0.767442     0.785714       43
"""

In [None]:
# Multi-layer perceptron with a single hidden layer of 1024 units.
# random_state is pinned so the fit is repeatable on Restart & Run All and the
# scores can be compared against other hidden-layer sizes.
nn_clf, nn_result = run_classifier_bio(
    clf=MLPClassifier(hidden_layer_sizes=(1024,), random_state=0, verbose=True),
    x_train=training_tokens,
    y_train=training_labels["0"],
    x_test=test_tokens,
    y_test=test_labels["0"],
)

# Presumably the pasted output of an earlier, unseeded run; a seeded re-run
# will not reproduce these exact numbers.
r = """
Results:
                       precision    recall  fbeta_score  support
I-SocialOfficialTexts   0.889706  0.707602     0.788274      171
I-SocialIssues          0.838446  0.668842     0.744102      613
I-CoalActivity          0.250000  0.250000     0.250000        4
I-Location              0.844444  0.716981     0.775510       53
I-CommitmentLevel       0.664000  0.293286     0.406863      283
I-Organisation          0.665025  0.610860     0.636792      221
I-Date                  0.881579  0.837500     0.858974       80
I-EnvironmentalIssues   0.718750  0.723891     0.721311      699
O                       0.960857  0.978937     0.969812    46289
B-EnvironmentalIssues   0.829317  0.847469     0.838295     1462
B-Date                  0.948949  0.948949     0.948949      333
B-Organisation          0.894209  0.661995     0.760777     1213
B-CommitmentLevel       0.476378  0.330150     0.390008      733
B-Location              0.905473  0.650000     0.756757      280
B-CoalActivity          0.894737  0.772727     0.829268       22
B-SocialIssues          0.829635  0.868880     0.848804     1205
B-SocialOfficialTexts   0.723404  0.790698     0.755556       43
"""