# CS 224U Final Project: Relation Extraction with Graph Embeddings

**Authors:** Ben Barnett and Aakash Pattabi

In [112]:
import json
import os
import sys
from collections import defaultdict

import numpy as np
import torch
from pycorenlp import StanfordCoreNLP

### 1. Loading TACRED data

In [5]:
# Locations of the three TACRED splits, relative to the notebook.
train_path = "tacred-relation/dataset/tacred/train.json"
eval_path = "tacred-relation/dataset/tacred/dev.json"
test_path = "tacred-relation/dataset/tacred/test.json"

# Read every split the same way instead of repeating the `with` block.
loaded_splits = {}
for split_name, split_path in (("train", train_path),
                               ("eval", eval_path),
                               ("test", test_path)):
    with open(split_path, "rb") as f:
        loaded_splits[split_name] = json.load(f)

train_data = loaded_splits["train"]
eval_data = loaded_splits["eval"]
test_data = loaded_splits["test"]

# Show the first training example as a quick sanity check of the schema.
sanity = train_data[0]
print(sanity)

{'id': '61b3a65fb906688c92a1', 'relation': 'no_relation', 'token': ['Ali', 'lied', 'about', 'having', 'to', 'leave', 'for', 'her', 'job', 'to', 'see', 'if', 'Jake', 'would', 'end', 'the', 'show', 'to', 'be', 'with', 'her', '.'], 'subj_start': 20, 'subj_end': 20, 'obj_start': 12, 'obj_end': 12, 'subj_type': 'PERSON', 'obj_type': 'PERSON', 'stanford_pos': ['NNP', 'VBD', 'IN', 'VBG', 'TO', 'VB', 'IN', 'PRP$', 'NN', 'TO', 'VB', 'IN', 'NNP', 'MD', 'VB', 'DT', 'NN', 'TO', 'VB', 'IN', 'PRP', '.'], 'stanford_ner': ['PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'stanford_head': ['2', '0', '4', '2', '6', '4', '11', '9', '11', '11', '6', '15', '15', '15', '11', '17', '15', '21', '21', '21', '17', '2'], 'stanford_deprel': ['nsubj', 'ROOT', 'mark', 'advcl', 'mark', 'xcomp', 'mark', 'nmod:poss', 'nsubj', 'mark', 'advcl', 'mark', 'nsubj', 'aux', 'advcl', 'det', 'dobj', 'mark', 'cop', 'case', 'acl', 'punct']}


### 2. Defining a sentence-level feature extractor

In [137]:
class GraphFeatureExtractor(object):
    """Build dependency-graph edge lists (plus optional word-vector node
    features) from sentence dictionaries.

    Each sentence dictionary is expected to provide the keys read in
    ``extract_graph``: "relation", "token", "subj_start", "subj_end",
    "obj_start" and "obj_end".

    Parameters
    ----------
    server : object
        Any object exposing ``annotate(text, properties=...)`` that returns the
        parser's JSON response as a dict (the notebook passes a pycorenlp
        ``StanfordCoreNLP`` client).
    parse_level : str
        One of "basic", "enhanced" or "extra_enhanced"; selects which
        dependency list is read from the parser response.
    embedding_path : str or None
        Path to a whitespace-separated embedding file (token followed by its
        vector components). When falsy, no node features are produced.
    save_path : str or None
        Default output directory used by ``save_jsons`` when that method is
        called without an explicit ``save_path``.
    """

    # Short parse-level names -> key read from each parsed sentence.
    _PARSE_LEVELS = {
        "basic" : "basicDependencies",
        "enhanced" : "enhancedDependencies",
        "extra_enhanced" : "enhancedPlusPlusDependencies"
    }

    def __init__(self, server, parse_level = "basic",
                 embedding_path = None,
                 save_path = None):
        self.server = server
        self.save_path = save_path
        self.set_parse_level(parse_level)

        # Initialize word embeddings. `embeddings` always exists so that the
        # feature helper can be used even when no file was loaded.
        self.embeddings = {}
        self.embedding_path = embedding_path if embedding_path else None
        if self.embedding_path:
            self._load_embeddings()

    def _load_embeddings(self):
        """Read ``self.embedding_path`` into ``self.embeddings`` (token -> list of floats)."""
        self.embeddings = {}
        # Explicit UTF-8: the platform default encoding is not reliable for
        # vocabulary files that contain non-ASCII tokens.
        with open(self.embedding_path, "r", encoding = "utf-8") as f:
            for line in f:
                tokens = line.split()
                if not tokens:
                    # Skip blank lines instead of failing on tokens[0].
                    continue
                self.embeddings[tokens[0]] = [float(i) for i in tokens[1:]]

    def set_parse_level(self, parse_level):
        """Select which dependency representation is read from parser output.

        Raises
        ------
        ValueError
            If ``parse_level`` is not "basic", "enhanced" or "extra_enhanced".
        """
        if parse_level not in self._PARSE_LEVELS:
            raise ValueError(
                "parse_level must be one of %s, got %r"
                % (sorted(self._PARSE_LEVELS), parse_level)
            )
        self.parse_level = self._PARSE_LEVELS[parse_level]

    def _annotate(self, text):
        """Send ``text`` to ``self.server`` and return its response.

        Empty text is answered locally with an empty parse so that no request
        is made for a sentence whose subject and object are adjacent.
        """
        if not text.strip():
            return {"sentences" : []}
        # The input is already tokenized and is a single sentence. Per the
        # CoreNLP documentation, `tokenize.whitespace` and
        # `ssplit.isOneSentence` ask the server to keep that tokenization and
        # not to split it, so node ids line up with the i+1 keys produced by
        # `_get_embedding_features` and no edges are lost to a second sentence.
        return self.server.annotate(text, properties = {
            "annotators" : "parse",
            "outputFormat" : "json",
            "tokenize.whitespace" : "true",
            "ssplit.isOneSentence" : "true"
        })

    def extract_graph(self, sentence):
        """Build the graphs for one sentence.

        Returns a dict with keys "full" (graph over the whole sentence),
        "middle" (graph over the tokens strictly between the two entity spans)
        and "Y" (the sentence's "relation" value). Each graph is a dict with an
        "edges" list and, when embeddings were loaded, a "features" dict.
        """
        Y = sentence["relation"]

        # Extract tokens, subsentence (b/w subj->obj tokens)
        tokens = sentence["token"]
        first = min(sentence["subj_end"], sentence["obj_end"])
        second = max(sentence["subj_start"], sentence["obj_start"])
        middle = tokens[first+1:second]

        # Parse with the extractor's own server (not a notebook-level global)
        # and convert each response to an edge list.
        X_full = self._parse_to_graph(self._annotate(" ".join(tokens)))
        X_middle = self._parse_to_graph(self._annotate(" ".join(middle)))

        # Add word-level GloVe features to graph inputs
        if self.embedding_path:
            X_full["features"] = self._get_embedding_features(tokens)
            X_middle["features"] = self._get_embedding_features(middle)

        return {"full" : X_full, "middle" : X_middle, "Y" : Y}

    def _parse_to_graph(self, parse):
        """Convert a parser response into ``{"edges": [[governor, dependent], ...]}``.

        Only the first sentence of the response is used. A response without
        sentences yields an empty edge list.

        Raises
        ------
        RuntimeError
            If ``parse`` is not a dict.
        """
        if not isinstance(parse, dict):
            # NOTE(review): pycorenlp appears to hand back the raw response
            # text when it cannot decode JSON (e.g. a server timeout message)
            # - confirm against the client version in use.
            raise RuntimeError(
                "Parser did not return a JSON object: %r" % (str(parse)[:200],)
            )
        sentences = parse.get("sentences") or []
        if not sentences:
            return self._convert_to_edgelist({})

        dep_list = sentences[0][self.parse_level]
        dep_graph = defaultdict(list)
        for d in dep_list:
            dep_graph[d["governor"]].append(d["dependent"])
        return self._convert_to_edgelist(dep_graph)

    def _convert_to_edgelist(self, dep_graph):
        """Flatten a governor -> [dependents] mapping into ``{"edges": [[g, d], ...]}``."""
        el = {
            "edges" : [[k, vi] for k, v in dep_graph.items() for vi in v]
        }
        return el

    def _get_embedding_features(self, sent, embedding_dim = 50):
        """Map node ids to vectors for the tokens in ``sent``.

        Token ``i`` (0-based) is stored under key ``i + 1``; key 0 is the
        [ROOT] node. A token missing from ``self.embeddings`` is retried in
        lowercase (the notebook loads a lowercase GloVe vocabulary) and
        otherwise receives ``embedding_dim`` uniform random values.
        """
        feats = {}
        for i, token in enumerate(sent):
            features = self.embeddings.get(token, None)
            if not features:
                # Case-insensitive fallback; exact matches still win.
                features = self.embeddings.get(token.lower(), None)
            if not features:
                features = [np.random.rand() for _ in range(embedding_dim)]
            feats[i+1] = features

        # By default, the [ROOT] token in the parse tree gets an embedding
        # vector of all zeroes.
        # TODO: revisit whether a different [ROOT] representation works better.
        feats[0] = [0]*embedding_dim
        return feats

    def extract_batch_graphs(self, sentences):
        """Run ``extract_graph`` on every sentence and return the results as a list."""
        return [self.extract_graph(s) for s in sentences]

    def save_jsons(self, graphs, save_path = None, postfix = ""):
        """Write each graph to ``<save_path>/<index><postfix>.json``.

        ``save_path`` falls back to the ``save_path`` given to the constructor.
        The directory is created when missing, and a trailing "/" is optional.

        Raises
        ------
        ValueError
            If neither this call nor the constructor supplied a save path.
        """
        target_dir = save_path or self.save_path
        if not target_dir:
            raise ValueError("save_path must be given here or in the constructor")
        os.makedirs(target_dir, exist_ok = True)
        for i, g in enumerate(graphs):
            out_file = os.path.join(target_dir, str(i) + postfix + ".json")
            with open(out_file, "w") as f:
                json.dump(g, f)

Below, we generate all graph features (with 50-dimensional GloVe vectors) for the TACRED training set. Two graphs are built per sentence: one over the entire sentence and one over only the "bridge" words between the subject and the object. Each graph is saved to a .json file, which we then post-process with Graph2Vec.

In [138]:
# GloVe vectors and the CoreNLP server used for parsing.
embedding_path = "./glove/glove.6B.50d.txt"
server = StanfordCoreNLP("http://localhost:9000")

# Extract the full-sentence and middle graphs for every training sentence.
fe = GraphFeatureExtractor(server, embedding_path = embedding_path)
feats = fe.extract_batch_graphs(train_data)

# One output directory per graph variant; both are written the same way.
full_save_path = "./train_features_full/"
middle_save_path = "./train_features_middle/"
for graph_key, out_dir in (("full", full_save_path), ("middle", middle_save_path)):
    graphs_to_save = [feat[graph_key] for feat in feats]
    fe.save_jsons(graphs_to_save, postfix = "", save_path = out_dir)

In [134]:
# Spot-check one extracted graph: its feature keys, its edges and its tokens.
X_full = [graph_pair["full"] for graph_pair in feats]
inspected_graph = X_full[7]
embs = inspected_graph["features"]
print(embs.keys())
print(inspected_graph["edges"])
print(train_data[7]["token"])

# Parse the first training sentence directly to view the raw dependency list.
first_sentence_text = " ".join(train_data[0]["token"])
parse_properties = {
    "annotators" : "parse",
    "outputFormat" : "json"
}
dp = server.annotate(first_sentence_text, properties = parse_properties)
dpd = dp["sentences"][0]["basicDependencies"]
print(dpd)

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24])
[[0, 3], [3, 1], [3, 2], [3, 10], [3, 25], [10, 4], [10, 13], [4, 5], [4, 6], [4, 7], [4, 8], [4, 9], [13, 11], [13, 12], [13, 17], [17, 14], [17, 16], [17, 19], [17, 24], [16, 15], [19, 18], [24, 20], [24, 21], [24, 22], [24, 23]]
['So', 'what', 'do', 'Fisher', ',', 'Kiffin', ',', 'and', 'Marciano', 'see', 'unfolding', 'Saturday', 'night', 'when', 'the', 'Jaguars', 'visit', 'the', 'Patriots', 'in', 'an', 'AFC', 'playoff', 'matchup', '?']
[{'dep': 'ROOT', 'governor': 0, 'governorGloss': 'ROOT', 'dependent': 2, 'dependentGloss': 'lied'}, {'dep': 'nsubj', 'governor': 2, 'governorGloss': 'lied', 'dependent': 1, 'dependentGloss': 'Ali'}, {'dep': 'mark', 'governor': 4, 'governorGloss': 'having', 'dependent': 3, 'dependentGloss': 'about'}, {'dep': 'advcl', 'governor': 2, 'governorGloss': 'lied', 'dependent': 4, 'dependentGloss': 'having'}, {'dep': 'mark', 'governor': 6, 'governorGloss': 'lea

### 3. Defining a test harness

### 4. Evaluating the model and comparing to baselines