In [1]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/My Drive/nombank_partitives

/content/drive/My Drive/nombank_partitives


In [3]:
%cd nombank_partitives_proj

/content/drive/My Drive/nombank_partitives/nombank_partitives_proj


In [14]:
import os
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

from tools.arg_scorer import score_file_with_NNP_adjustment
from tree import *


In [15]:

train_filename = "percentage/train.data"
train_parse_filename = "gold_parses/%_nombank.clean.train.gold_parse"
train_baseline_filename = "percentage/train.features"
dev_filename = "percentage/dev.data"
dev_parse_filename = "gold_parses/%_nombank.clean.dev.gold_parse"
dev_baseline_filename = "percentage/dev.features"
dev_out_filename = "percentage/dev.out"
test_filename = "percentage/test.data"
test_parse_filename = "gold_parses/%_nombank.clean.test.gold_parse"
test_baseline_filename = "percentage/test.features"
test_out_filename = "percentage/test.out"

In [16]:


def load_stemming_dictionary(infile):
    stem_dict = dict()
    with open(infile) as instream:
        for line in instream:
            line = line.strip(os.linesep)
            outlist = line.lower().split(',')
            word = outlist[0]
            if word not in stem_dict:
                stem_dict[word] = outlist[1]
    return stem_dict

In [17]:


stemming_dict: dict[str, str] = load_stemming_dictionary(os.path.join('tools/morph-base-mapping.csv'))

In [18]:
def get_stemmed_word(word):
    global stemming_dict
    lower_word = word.lower()
    if lower_word in stemming_dict:
        return stemming_dict[lower_word]
    else:
        return lower_word

In [19]:

def normalize_partitive_class(feature):
    if feature == 'METONYM':
        return('MERONYM')  ## probably typo
    elif feature == 'INSTANCE':
        return('INSTANCE-OF-SET')
    elif feature.startswith('NOMADJ'):
        return('NOMADJ')
    elif feature.startswith('NOMLIKE'):
        return('NOM')
    elif feature == 'BODY-PART':
        return('PART-OF-BODY-FURNITURE-ETC')
    elif 'feature' in ['BOOK-CHAPTER','BORDER','DIVISION']:
        return('MERONYM')
    else:
        return(feature)

In [20]:
def get_path_features(filename, parse_filename):
    _paths = []

    with open(filename) as f:
        data = [x.strip() for x in f.readlines()]

    data_idx = 0

    with open(parse_filename) as f:
        buf = [x.strip() for x in f.readlines()]

    sentence_cnt = 0
    parse_file_cnt = 0
    no_dest_node = 0
    while parse_file_cnt < len(buf):
        # Ignore the header
        while parse_file_cnt < len(buf) and len(buf[parse_file_cnt]) > 0:
            parse_file_cnt += 1
        while parse_file_cnt < len(buf) and len(buf[parse_file_cnt]) == 0:
            parse_file_cnt += 1

        # Read the parse tree
        parse_tree = ""
        while parse_file_cnt < len(buf) and len(buf[parse_file_cnt]) > 0:
            parse_tree += buf[parse_file_cnt]
            parse_file_cnt += 1
        parse_tree = parse_tree[1:-1]

        while parse_file_cnt < len(buf) and len(buf[parse_file_cnt]) == 0:
            parse_file_cnt += 1

        tree = build_tree(parse_tree)
        leaves = find_leaves(tree, [])

        sentence_words = []
        while len(data[data_idx]) > 0:
            sentence_words.append(data[data_idx].split("\t"))
            data_idx += 1

        while data_idx < len(data) and len(data[data_idx]) == 0:
            data_idx += 1

        assert len(leaves) == len(sentence_words)

        # POS of parent and grandparent
        tmp = [dict() for _ in range(len(sentence_words))]
        for i in range(len(sentence_words)):
            tmp[i]["parent_pos"] = ""
            tmp[i]["grandparent_pos"] = ""
            line = sentence_words[i]
            parent = leaves[int(line[3])].parent
            if parent is not None:
                tmp[i]["parent_pos"] = parent.pos
                gran = parent.parent
                if gran is not None:
                    tmp[i]["grandparent_pos"] = gran.pos


        # Find paths from each word to pred
        dest_node = None
        for line in sentence_words:
            if len(line) > 5 and line[5] == "PRED":
                dest_node = leaves[int(line[3])]
                break

        if dest_node is None:
            no_dest_node += 1
        for i in range(len(sentence_words)):
            line = sentence_words[i]
            if dest_node is None:
                tmp[i]["pred_path"] = ""
            else:
                tmp[i]["pred_path"] = find_path_between_nodes(leaves[int(line[3])], dest_node, tree)

        # Find paths from each word to supprort
        dest_node = None
        for line in sentence_words:
            if len(line) > 5 and line[5] == "SUPPORT":
                dest_node = leaves[int(line[3])]
                break

        for i in range(len(sentence_words)):
            line = sentence_words[i]
            if dest_node is None:
                tmp[i]["support_path"] = ""
            else:
                tmp[i]["support_path"] = find_path_between_nodes(leaves[int(line[3])], dest_node, tree)

        sentence_cnt += 1

        _paths += tmp

    print(f"{no_dest_node} sentences with no predicate")

    paths = pd.DataFrame(_paths)

    return paths

In [21]:
def get_other_baseline_features(feature_filename, use_cache=False):
    if use_cache and os.path.exists(f"{feature_filename}.pkl"):
        return pd.read_pickle(f"{feature_filename}.pkl")
    useless_feature_list = set(["POS", "BIO", "stemmed_word", "word_back_1", "stemmed_word_back_1", "POS_back_1", "BIO_back_1", "word_back_2", "stemmed_word_back_2", "POS_back_2", "BIO_back_2", "word_plus_1", "stemmed_word_plus_1", "POS_plus_1", "BIO_plus_1", "word_plus_2", "stemmed_word_plus_2", "POS_plus_2", "BIO_plus_2", "word_plus_3", "stemmed_word_plus_3", "POS_plus_3", "BIO_plus_3", "REL_plus_1", "REL_plus_2", "REL_plus_3", "REL_back_1", "REL_back_2", "pred_path", "support_path"])
    with open(feature_filename) as f:
        buf = [x.strip() for x in f.readlines()]
    buf = [x for x in buf if len(x) > 0]

    _features = []
    _feature_freq = dict()
    for line in buf:
        feature_dict = dict()
        for x in line.split("\t")[1:]:
            _x = x.split("=")
            if len(_x) != 2:
                continue
            k = _x[0]
            v = _x[1]
            if k in useless_feature_list:
                continue
            else:
                feature_dict[k] = v
                if k not in _feature_freq:
                    _feature_freq[k] = 0
                _feature_freq[k] += 1
        _features.append(feature_dict)

    print("Constructing DataFrame")
    features = pd.DataFrame(_features, columns=[k for k, v in _feature_freq.items() if v >= 50])  # Delete useless features
    print(features.shape)

    for column in features.columns:
        value_count = features[column].value_counts()
        if value_count.shape[0] == 1:
            features[column] = (~features[column].isna()).astype(int)
        elif column == "relation_feature":
            features[column] = features[column].fillna("")
        else:
            features[column] = features[column].fillna(0).astype(np.float64)

    features.to_pickle(f"{feature_filename}.pkl")

    return features

In [22]:
path_features = get_path_features(dev_filename, dev_parse_filename)

0 sentences with no predicate


In [23]:
path_features

Unnamed: 0,parent_pos,grandparent_pos,pred_path,support_path
0,NP,S,DT↑NP↑S↓VP↓NP↓NP↓NP↓NN,DT↑NP↑S↓VP↓NP↓NP↓NN
1,NP,S,NN↑NP↑S↓VP↓NP↓NP↓NP↓NN,NN↑NP↑S↓VP↓NP↓NP↓NN
2,NP,S,NN↑NP↑S↓VP↓NP↓NP↓NP↓NN,NN↑NP↑S↓VP↓NP↓NP↓NN
3,VG,VP,VBZ↑VG↑VP↓NP↓NP↓NP↓NN,VBZ↑VG↑VP↓NP↓NP↓NN
4,NP,NP,DT↑NP↓NP↓NN,DT↑NP↓NN
...,...,...,...,...
2222,PP,VP,IN↑PP↑VP↓PP↓NP↓NN,
2223,NP,QP,$↑NP↑QP↑NP↑PP↑VP↓PP↓NP↓NN,
2224,NP,QP,CD↑NP↑QP↑NP↑PP↑VP↓PP↓NP↓NN,
2225,NP,QP,CD↑NP↑QP↑NP↑PP↑VP↓PP↓NP↓NN,


In [24]:
path_features.describe()

Unnamed: 0,parent_pos,grandparent_pos,pred_path,support_path
count,2227,2227,2227,2227.0
unique,17,11,1249,733.0
top,NP,NP,NN,
freq,1255,722,83,838.0


In [25]:


def get_heuristic_features(filename, parse_filename):
    _features = []
    _heuristics = ['support_verb_to_preceding_ARG1_heuristic',
                   'support_verb_to_following_ARG1_heuristic',
                   'predicate_noun_to_preceding_ARG1_heuristic',
                   'predicate_noun_to_following_ARG1_heuristic']

    with open(filename) as f:
        data = [x.strip() for x in f.readlines()]

    data_idx = 0

    with open(parse_filename) as f:
        buf = [x.strip() for x in f.readlines()]

    parse_file_cnt = 0

    while parse_file_cnt < len(buf):

        while parse_file_cnt < len(buf) and len(buf[parse_file_cnt]) > 0:
            parse_file_cnt += 1
        while parse_file_cnt < len(buf) and len(buf[parse_file_cnt]) == 0:
            parse_file_cnt += 1

        parse_tree = ""
        while parse_file_cnt < len(buf) and len(buf[parse_file_cnt]) > 0:
            parse_tree += buf[parse_file_cnt]
            parse_file_cnt += 1
        parse_tree = parse_tree[1:-1]

        while parse_file_cnt < len(buf) and len(buf[parse_file_cnt]) == 0:
            parse_file_cnt += 1

        tree = build_tree(parse_tree)
        leaves = find_leaves(tree, [])

        sentence_words = []
        while len(data[data_idx]) > 0:
            sentence_words.append(data[data_idx].split("\t"))
            data_idx += 1

        while data_idx < len(data) and len(data[data_idx]) == 0:
            data_idx += 1

        assert len(leaves) == len(sentence_words)

        support_verb_position = None
        predicate_noun_position = None

        for i, word_data in enumerate(sentence_words):
            if len(word_data) > 5:
                if word_data[5] == 'SUPPORT':
                    support_verb_position = i
                elif word_data[5] == 'PRED':
                    predicate_noun_position = i

        tmp = [dict.fromkeys(_heuristics, 0) for _ in sentence_words]  # initialize heuristic features with 0

        for i, word_data in enumerate(sentence_words):
            if len(word_data) > 5 and 'ARG1' in word_data:
                if support_verb_position is not None:
                    if i < support_verb_position:
                        path = find_path_between_nodes(leaves[i], leaves[support_verb_position], tree)
                        tmp[i]['support_verb_to_preceding_ARG1_heuristic'] = int(path == '↑S↓NP↓Noun')
                    elif i > support_verb_position:
                        path = find_path_between_nodes(leaves[i], leaves[support_verb_position], tree)
                        tmp[i]['support_verb_to_following_ARG1_heuristic'] = int(path == '↑VP↓NP↓Noun')

                if predicate_noun_position is not None and support_verb_position is None:
                    path = find_path_between_nodes(leaves[i], leaves[predicate_noun_position], tree)
                    if i < predicate_noun_position:
                        tmp[i]['predicate_noun_to_preceding_ARG1_heuristic'] = int(path == '↑NP↓NP')
                    elif i > predicate_noun_position:
                        tmp[i]['predicate_noun_to_following_ARG1_heuristic'] = 1 # indictor that the word is ahead of the predicate, the feature will be the bio seq as per heuristics.

        _features += tmp

    features = pd.DataFrame(_features)
    return features


In [26]:
heuristic_features = get_heuristic_features(dev_filename, dev_parse_filename)

In [27]:
heuristic_features


Unnamed: 0,support_verb_to_preceding_ARG1_heuristic,support_verb_to_following_ARG1_heuristic,predicate_noun_to_preceding_ARG1_heuristic,predicate_noun_to_following_ARG1_heuristic
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0
...,...,...,...,...
2222,0,0,0,0
2223,0,0,0,0
2224,0,0,0,0
2225,0,0,0,0


In [28]:
baseline_features = get_other_baseline_features(dev_baseline_filename, use_cache=True)

In [29]:
baseline_features.head()

Unnamed: 0,PARTITIVE-QUANT,unigram_embed_similarity,slash_unigram_embed_similarity,before_prep,before_support,before_pred,forward_bigram_embed_similarity,back_bigram_embed_similarity,forward_bigram_embed_slash_similarity,back_bigram_embed_slash_similarity,...,top_2_embed_slash_back_trigram,top_4_embed_forward_trigram,top_5_embed_slash_forward_bigram,top_2_embed_slash_forward_bigram,top_5_embed_unigram,top_5_embed_forward_bigram,top_4_embed_slash_forward_bigram,before_conj,after_conj,3_or_less_after_of
0,1,0.18083,0.32276,8.0,7.0,6.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0
1,1,0.51628,0.32143,7.0,6.0,5.0,0.43975,0.51181,0.33942,0.46002,...,0,0,0,0,0,0,0,0.0,0.0,0
2,1,0.37223,0.3186,6.0,5.0,4.0,0.5224,0.54984,0.3341,0.4637,...,0,0,0,0,0,0,0,0.0,0.0,0
3,1,0.6241,0.32161,5.0,4.0,3.0,0.5971,0.60593,0.3351,0.49285,...,0,0,0,0,0,0,0,0.0,0.0,0
4,1,0.26912,0.28588,4.0,3.0,2.0,0.50393,0.53072,0.30246,0.49563,...,0,0,0,0,0,0,0,0.0,0.0,0


In [30]:
def get_data(filename, parse_filename, baseline_filename):
    _data = []

    print("Reading file")

    with open(filename) as infile:
        buf = infile.read().splitlines()

    prefix = ["prev2_", "prev_", "", "next_", "next2_", "next3_"]

    print("Generating basic fields")

    sentence_words = [["SENTENCE_BREAK"] * 5] * 2
    for _line in buf:
        if len(_line) == 0:
            sentence_words += [["SENTENCE_BREAK"] * 5] * 3
            stem_words = [get_stemmed_word(x[0]) for x in sentence_words]
            found_pred = "False"
            for i in range(0, len(sentence_words) - 5):
                tmp = {"arg1": 0}
                for j in range(6):
                    tmp[prefix[j] + "word"] = sentence_words[i + j][0]
                    tmp[prefix[j] + "stem"] = stem_words[i + j]
                    tmp[prefix[j] + "bio"] = sentence_words[i + j][2]
                    tmp[prefix[j] + "pos"] = sentence_words[i + j][1]
                    if len(sentence_words[i + j]) > 5 and sentence_words[i + j][5] != "ARG1":
                            tmp[prefix[j] + "rel"] = sentence_words[i + j][5]

                line_features = sentence_words[i + 2]
                tmp["token_number"] = line_features[3]
                tmp["sentence_number"] = line_features[4]
                if len(line_features) > 5:
                    if line_features[5] == "PRED":
                        found_pred = "True"
                    if line_features[5] == "ARG1":
                        tmp["arg1"] = 1
                tmp["right_to_pred"] = found_pred

                _data.append(tmp)

            sentence_words = [["SENTENCE_BREAK"] * 5] * 2
        else:
            sentence_words.append(_line.split("\t"))

    str_data = pd.DataFrame(_data)

    print("Generating baseline features")
    numeric_data = get_other_baseline_features(baseline_filename, use_cache=True)
    relation_feature = numeric_data["relation_feature"]
    del numeric_data["relation_feature"]

    print("Generating parse tree based path features")
    path_features = get_path_features(filename, parse_filename)
    assert path_features.shape[0] == str_data.shape[0]


    #print("Generating heuristic features")
    #heuristic_features = get_heuristic_features(filename, parse_filename)
    #assert heuristic_features.shape[0] == str_data.shape[0]


    str_data = pd.concat([str_data, relation_feature, path_features], axis=1)

    str_data = str_data[sorted(str_data.columns)]
    numeric_data = numeric_data[sorted(numeric_data.columns)]

    return str_data, numeric_data


In [31]:

str_data_o, numeric_data_o = get_data(train_filename, train_parse_filename, train_baseline_filename)

y_o = str_data_o["arg1"]
del str_data_o["arg1"]

Reading file
Generating basic fields
Generating baseline features
Generating parse tree based path features
0 sentences with no predicate


In [32]:

enc_o = OneHotEncoder(handle_unknown="infrequent_if_exist", min_frequency=2)
scaler_o = StandardScaler(with_mean=False)

In [33]:
X_o = scaler_o.fit_transform(enc_o.fit_transform(str_data_o))
X_o = hstack((X_o, csr_matrix(numeric_data_o.values)))
print(X_o.shape)
print(y_o.shape)

# clf = AdaBoostClassifier(random_state=0, n_estimators=100)
clf_o = LogisticRegression(random_state=0, max_iter=100)
clf_o.fit(X_o, y_o)

(61131, 49636)
(61131,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [34]:
def get_data_with_heuristics(filename, parse_filename, baseline_filename):
    _data = []

    print("Reading file")

    with open(filename) as infile:
        buf = infile.read().splitlines()

    prefix = ["prev2_", "prev_", "", "next_", "next2_", "next3_"]

    print("Generating basic fields")

    sentence_words = [["SENTENCE_BREAK"] * 5] * 2
    for _line in buf:
        if len(_line) == 0:
            sentence_words += [["SENTENCE_BREAK"] * 5] * 3
            stem_words = [get_stemmed_word(x[0]) for x in sentence_words]
            found_pred = "False"
            for i in range(0, len(sentence_words) - 5):
                tmp = {"arg1": 0}
                for j in range(6):
                    tmp[prefix[j] + "word"] = sentence_words[i + j][0]
                    tmp[prefix[j] + "stem"] = stem_words[i + j]
                    tmp[prefix[j] + "bio"] = sentence_words[i + j][2]
                    tmp[prefix[j] + "pos"] = sentence_words[i + j][1]
                    if len(sentence_words[i + j]) > 5 and sentence_words[i + j][5] != "ARG1":
                            tmp[prefix[j] + "rel"] = sentence_words[i + j][5]

                line_features = sentence_words[i + 2]
                tmp["token_number"] = line_features[3]
                tmp["sentence_number"] = line_features[4]
                if len(line_features) > 5:
                    if line_features[5] == "PRED":
                        found_pred = "True"
                    if line_features[5] == "ARG1":
                        tmp["arg1"] = 1
                tmp["right_to_pred"] = found_pred

                _data.append(tmp)

            sentence_words = [["SENTENCE_BREAK"] * 5] * 2
        else:
            sentence_words.append(_line.split("\t"))

    str_data = pd.DataFrame(_data)

    print("Generating baseline features")
    numeric_data = get_other_baseline_features(baseline_filename, use_cache=True)
    relation_feature = numeric_data["relation_feature"]
    del numeric_data["relation_feature"]

    print("Generating parse tree based path features")
    path_features = get_path_features(filename, parse_filename)
    assert path_features.shape[0] == str_data.shape[0]


    print("Generating heuristic features")
    heuristic_features = get_heuristic_features(filename, parse_filename)
    assert heuristic_features.shape[0] == str_data.shape[0]


    str_data = pd.concat([str_data, relation_feature, path_features, heuristic_features], axis=1)

    str_data = str_data[sorted(str_data.columns)]
    numeric_data = numeric_data[sorted(numeric_data.columns)]

    return str_data, numeric_data


In [35]:

str_data, numeric_data = get_data_with_heuristics(train_filename, train_parse_filename, train_baseline_filename)

y = str_data["arg1"]
del str_data["arg1"]

Reading file
Generating basic fields
Generating baseline features
Generating parse tree based path features
0 sentences with no predicate
Generating heuristic features


In [36]:

enc = OneHotEncoder(handle_unknown="infrequent_if_exist", min_frequency=2)
scaler = StandardScaler(with_mean=False)

In [37]:
X = scaler.fit_transform(enc.fit_transform(str_data))
X = hstack((X, csr_matrix(numeric_data.values)))
print(X.shape)
print(y.shape)

# clf = AdaBoostClassifier(random_state=0, n_estimators=100)
clf_h = LogisticRegression(random_state=0, max_iter=100)
clf_h.fit(X, y)

(61131, 49641)
(61131,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
def apply_to_test_data_with_heuristics(in_filename, in_parse_filename, in_baseline_filename, out_filename):
    test_str_data, test_numeric_data = get_data_with_heuristics(in_filename, in_parse_filename, in_baseline_filename)

    del test_str_data["arg1"]

    # Align columns of dev set and training set
    test_numeric_data = pd.concat([test_numeric_data, pd.DataFrame(index=range(test_numeric_data.shape[0]), columns=list(set(numeric_data.columns).difference(set(test_numeric_data.columns)))).fillna(0)], axis=1)
    test_str_data = test_str_data[str_data.columns]
    test_numeric_data = test_numeric_data[numeric_data.columns]

    test_X = scaler.transform(enc.transform(test_str_data))
    test_X = hstack((test_X, csr_matrix(test_numeric_data.values)))
    print(test_X.shape)

    test_y = clf_h.predict(test_X)

    with open(in_filename) as f:
        buf = f.read().splitlines()
    with open(out_filename, "w") as outf:
        i = 0
        res = []
        for _line in buf:
            if len(_line) == 0:
                res.append("\n")
            else:
                res.append("\t".join(_line.split("\t")[:5] + (["ARG1"] if test_y[i] > 0 else [])) + "\n")

                i += 1

        outf.writelines(res)

In [39]:
# model with heuristic features, parse tree based path features and baseline features
# One hot encoding used
apply_to_test_data_with_heuristics(test_filename, test_parse_filename, test_baseline_filename, test_out_filename)

score_file_with_NNP_adjustment(test_filename, test_out_filename, "arg1")

Reading file
Generating basic fields
Generating baseline features
Generating parse tree based path features
0 sentences with no predicate
Generating heuristic features
(4278, 49641)
System [9, 62, 78, 103, 116, 154, 169, 195, 229, 257, 288, 320, 355, 380, 414, 421, 438, 462, 501, 523, 557, 581, 612, 637, 660, 722, 742, 756, 784, 834, 870, 956, 1013, 1057, 1096, 1142, 1168, 1169, 1170, 1171, 1277, 1298, 1313, 1328, 1345, 1432, 1457, 1482, 1490, 1547, 1592, 1615, 1642, 1664, 1675, 1693, 1739, 1783, 1810, 1828, 1852, 1863, 1891, 1941, 1947, 1972, 2161, 2254, 2279, 2333, 2400, 2401, 2414, 2458, 2527, 2563, 2573, 2611, 2633, 2690, 2732, 2762, 2780, 2893, 2932, 2959, 2989, 3052, 3086, 3116, 3136, 3165, 3248, 3262, 3292, 3321, 3356, 3376, 3403, 3427, 3457, 3479, 3513, 3515, 3566, 3591, 3647, 3673, 3734, 3780, 3813, 3814, 3815, 3816, 3817, 3837, 3838, 3839, 3840, 3841, 3848, 3849, 3850, 3851, 3864, 3865, 3923, 3974, 4022, 4025, 4054, 4072, 4094, 4130, 4160, 4206, 4237, 4275, 4308, 4330, 4340, 