### References:
1. https://web.stanford.edu/class/cs224u/2021/


In [1]:
import numpy as np
import os
import rel_ext_OLD
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from collections import Counter
import utils

In [2]:
# Build the corpus of examples from a gzipped TSV file. The path is relative,
# so the file must sit in the notebook's working directory.
corpus = rel_ext_OLD.Corpus('washington_post_test.tsv.gz')

In [3]:
# Build the knowledge base (KB) from a gzipped TSV file (relative path, same
# working-directory requirement as the corpus file).
kb = rel_ext_OLD.KB('Atsuko_filtered_KB.tsv.gz')

In [4]:
# Combine the corpus and the KB into one Dataset object; the train/test splits
# below are built from it.
dataset = rel_ext_OLD.Dataset(corpus, kb)

In [5]:
# Partition the dataset into an 80% 'train' split and a 20% 'test' split.
# NOTE(review): the fixed seed presumably makes the partition reproducible
# across runs -- confirm against rel_ext_OLD.Dataset.build_splits.
splits = dataset.build_splits(
    split_names=['train', 'test'],
    split_fracs=[0.80, 0.20],
    seed=1)

In [6]:
# Display the splits as a sanity check on their sizes.
splits

{'train': Corpus with 54,237 examples; KB with 22,313 triples,
 'test': Corpus with 14,543 examples; KB with 6,262 triples,
 'all': Corpus with 68,780 examples; KB with 28,575 triples}

In [7]:
# Per-relation summary of the training split (corpus examples vs. KB triples).
splits['train'].count_examples()

                                             examples
relation               examples    triples    /triple
--------               --------    -------    -------
adjoins                     641       1283       0.50
capital                      90        406       0.22
contains                    271      14461       0.02
has_spouse                    4       2419       0.00
nationality                  14       1296       0.01
place_of_birth                0        874       0.00
place_of_death                4        676       0.01
worked_at                     1        898       0.00


In [8]:
# Per-relation summary of the test split (corpus examples vs. KB triples).
splits['test'].count_examples()

                                             examples
relation               examples    triples    /triple
--------               --------    -------    -------
adjoins                     145        419       0.35
capital                      19        116       0.16
contains                     59       4220       0.01
has_spouse                    0        575       0.00
nationality                   1        302       0.00
place_of_birth                1        223       0.00
place_of_death                0        155       0.00
worked_at                     0        252       0.00


### GloVe

In [11]:
# Read the GloVe vectors file into a lookup keyed by word. The featurizer
# below calls `.get(word)` and `.values()` on it, so it must behave like a
# dict. The file name indicates the 300-dimensional "6B" GloVe release.
glove_lookup = utils.glove2dict('glove.6B.300d.txt')

In [12]:
def glove_middle_featurizer(kbt, corpus, np_func=np.sum):
    """Represent a KB triple by combining GloVe vectors of its "middle" words.

    For every corpus example retrieved for the pair (kbt.sbj, kbt.obj), the
    example's `middle` string is split on whitespace and each token is looked
    up in the module-level `glove_lookup`. Tokens that are not found are
    skipped. Only the (sbj, obj) order is queried; examples stored under the
    reversed pair are not consulted here.

    Parameters
    ----------
    kbt
        Object exposing `sbj` and `obj` attributes.
    corpus
        Object exposing `get_examples_for_entities(sbj, obj)`, which yields
        examples that have a string `middle` attribute.
    np_func
        Reduction called as `np_func(vectors, axis=0)` on the list of
        collected word vectors. Defaults to `np.sum`.

    Returns
    -------
    The reduced vector, or, when no token was found in `glove_lookup`, a
    vector from `utils.randvec` whose length equals that of the vectors in
    `glove_lookup`.
    """
    # Lazily look up every middle token; misses come back as None.
    lookups = (
        glove_lookup.get(token)
        for example in corpus.get_examples_for_entities(kbt.sbj, kbt.obj)
        for token in example.middle.split()
    )
    vectors = [vec for vec in lookups if vec is not None]

    if vectors:
        return np_func(vectors, axis=0)

    # No overlap with GloVe's vocabulary: fall back to a random vector of the
    # right dimensionality, read off an arbitrary entry of the lookup.
    # NOTE(review): this fallback is random, so features for such triples may
    # vary between runs unless the relevant seed is fixed -- confirm how
    # utils.randvec draws its values.
    dim = len(next(iter(glove_lookup.values())))
    return utils.randvec(n=dim)

In [13]:
def model_factory():
    """Return a new, unfitted LogisticRegression classifier.

    Written as a `def` instead of a lambda bound to a name (PEP 8), so the
    callable has a real name in tracebacks and can carry this docstring.
    It is still called with no arguments, and each call builds a fresh
    estimator with `fit_intercept=True` and the 'liblinear' solver.
    """
    return LogisticRegression(fit_intercept=True, solver='liblinear')

In [17]:
# Run the relation-extraction experiment with the GloVe featurizer, using the
# 'train' split for training and the 'test' split for evaluation, with models
# produced by `model_factory`.
# NOTE(review): `vectorize=False` presumably tells the experiment that the
# featurizer already returns numeric vectors, so no vectorization step is
# needed -- confirm against rel_ext_OLD.experiment.
glove_results = rel_ext_OLD.experiment(
    splits,
    train_split='train',
    test_split='test',
    featurizers=[glove_middle_featurizer],
    vectorize=False,
    model_factory=model_factory,
    verbose=True)

relation              precision     recall    f-score    support       size
------------------    ---------  ---------  ---------  ---------  ---------
adjoins                   0.174      0.010      0.039        419       2795
capital                   0.000      0.000      0.000        116       2492
contains                  0.766      0.993      0.803       4220       6596
has_spouse                0.345      0.158      0.279        575       2951
nationality               0.200      0.040      0.111        302       2678
place_of_birth            0.000      0.000      0.000        223       2599
place_of_death            0.100      0.006      0.026        155       2531
worked_at                 0.278      0.040      0.126        252       2628
------------------    ---------  ---------  ---------  ---------  ---------
macro-average             0.233      0.156      0.173       6262      25270
