The goal of this notebook is to generate a `.json` input file for the model from a text, so that the model can predict the relations present in that text. In the Dathena pipeline, this should be implemented in `Scala` using the output of the NER tagger.

In [1]:
import json
import hashlib
import os

import spacy

In [2]:
# Sample input: a handful of short English sentences. The cells below split it
# into sentences and emit one record per ordered pair of entities per sentence.
# NOTE(review): "Apple. inc" looks like a typo for "Apple Inc." — the stray
# period may introduce an extra sentence boundary; confirm before relying on it.
text = """
Bastien was born in New York.
Andrew works for Apple. inc since 2010.
Peter is the son of Robert. Peter was born in 1993.
Peter graduated from MIT in 2019. 
Peter and his brother Andrew are French.
"""

In [3]:
# Load the English spaCy pipeline used for sentence splitting and NER.
# The 'en' shortcut link only exists on spaCy 2.x; spaCy 3.0 removed shortcut
# links and raises OSError for it. Keep 'en' as the first choice so existing
# environments behave exactly as before, and fall back to the full package
# name so the notebook also runs on spaCy 3.x.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

In [4]:
def split_by_sentence(text):
    """Split ``text`` into sentences using the module-level ``nlp`` pipeline.

    Args:
        text: Raw text to segment.

    Returns:
        list[str]: ``str()`` of every item in the parsed document's ``sents``,
        in document order. The strings are not stripped or filtered.
    """
    parsed = nlp(text)
    return list(map(str, parsed.sents))
# Segment the sample text; `sents` is iterated by the pair-generation loop in a later cell.
sents = split_by_sentence(text)

In [5]:
def encode_hash(obj):
    """Return the SHA-224 hex digest of ``str(obj)``.

    Args:
        obj: Any object; its ``str()`` form, encoded with the default
            ``str.encode()`` codec, is what gets hashed.

    Returns:
        str: The 56-character hexadecimal digest.
    """
    payload = str(obj).encode()
    hasher = hashlib.sha224()
    hasher.update(payload)
    return hasher.hexdigest()

In [6]:
# Maps entity id (hash of the entity text) -> entity text; filled in by detect_entities().
id2ent = {}
def detect_entities(sent):
    """Build one relation-candidate record per ordered pair of entities in ``sent``.

    The sentence is parsed with the module-level ``nlp`` pipeline. Entities
    whose text is empty or whitespace-only (e.g. ``'\\n'``, ``' '``, ``'\\n\\n'``)
    are discarded. Every remaining (head, tail) combination with head != tail
    yields a record, so each unordered pair appears twice, once per direction.

    Side effect: for every entity that takes part in at least one pair, its
    id -> text mapping is stored in the module-level ``id2ent`` dict.

    Args:
        sent: Sentence text to analyse.

    Returns:
        list[dict]: Records with keys ``'sentence'``, ``'head'``, ``'tail'``
        and ``'relation'`` (always ``'NA'``). ``'head'`` and ``'tail'`` are
        dicts with ``'word'`` (entity text) and ``'id'`` (``encode_hash`` of
        the entity). Empty when fewer than two entities remain.
    """
    doc = nlp(sent)
    # Resolve text and id once per entity instead of inside the O(n^2) pair loop.
    # strip() catches every whitespace-only entity, not just the literal '\n' and ' '.
    mentions = [(ent, str(ent), str(encode_hash(ent)))
                for ent in doc.ents if ent.text.strip()]
    json_obj = []
    for head, head_word, head_id in mentions:
        for tail, tail_word, tail_id in mentions:
            if head != tail:
                id2ent[head_id] = head_word
                id2ent[tail_id] = tail_word
                json_obj.append({'sentence': sent,
                                 'head': {
                                     'word': head_word,
                                     'id': head_id
                                 },
                                 'tail': {
                                     'word': tail_word,
                                     'id': tail_id
                                 },
                                 'relation': 'NA'})
    return json_obj

In [7]:
# Gather the entity-pair records of every sentence into one flat list.
json_obj = []
for sent in sents:
    json_obj.extend(detect_entities(sent))

In [8]:
# Persist the entity-id -> entity-text lookup as JSON in the working directory.
with open('id2ent.json', 'w') as id2ent_file:
    json.dump(id2ent, id2ent_file)

In [9]:
# Write the generated records to pred.json inside `directory`.
directory = '../data/nyt'
# makedirs(exist_ok=True) also creates a missing '../data' parent and does not
# fail if the directory already exists (no check-then-create race).
os.makedirs(directory, exist_ok=True)
# Derive the file path from `directory` so the two cannot drift apart.
with open(os.path.join(directory, 'pred.json'), 'w') as f:
    f.write(json.dumps(json_obj))