In [3]:
from converter import URIConverter
from math import log
import json
from utils import truncated_log, overlap
from candidate import Candidate
from diffbot_api import CachedQuery, EL_POL_ENTITY_TYPES


# Debugging hook: BaselineLinker._link_db_query rebinds this module-level name
# to the (unsorted) candidate list of its most recent call, so that later cells
# can inspect it. It stays None until a query has been linked.
cd = None

class BaselineLinker(object):
    """Baseline entity linker built on cached Diffbot queries.

    For every phrase, one query per entry of ``EL_POL_ENTITY_TYPES`` is issued
    through ``CachedQuery``; each hit that carries an ``importance`` value
    becomes a ``Candidate`` and the highest-ranked candidate is returned.

    Args:
        use_overlap: if true, a candidate's score is
            ``truncated_log(importance) * overlap(name, target)``; otherwise
            the raw importance is used as the score.
        verbose: stored on the instance; not consulted by any method here.
    """

    def __init__(self, use_overlap = True, verbose = True):
        self._cq = CachedQuery()
        self._conv = URIConverter()
        self._use_overlap = use_overlap
        self._verbose = verbose

    def __del__(self):
        self.close()

    def close(self):
        """Close the query cache and the URI converter.

        Each resource is closed independently, so a failure on one does not
        prevent the other from being closed. Attributes that were never set
        (e.g. ``__init__`` failed part-way) are skipped.
        """
        for attr_name in ("_cq", "_conv"):
            resource = getattr(self, attr_name, None)
            if resource is None:
                continue
            try:
                resource.close()
            except Exception:
                # Best effort: closing twice (explicit close + __del__) is expected.
                print("Warning: trying to close a closed object.")

    def _find_wiki_uri(self, uris):
        """Return the first URI containing "wikipedia.org", or "" if none does."""
        for uri in uris:
            if "wikipedia.org" in uri:
                return uri
        return ""

    def _get_dbpedia_uri(self, wiki_uri, uris):
        """Resolve a DBpedia URI for a hit.

        Uses ``wikipedia2dbpedia`` when ``wiki_uri`` is non-empty; otherwise
        tries ``wikidata2dbpedia`` on each of ``uris`` and returns the first
        non-empty result. Returns "" when nothing resolves.
        """
        if wiki_uri != "":
            return self._conv.wikipedia2dbpedia(wiki_uri)

        dbpedia_uri = ""
        for uri in uris:
            dbpedia_uri = self._conv.wikidata2dbpedia(uri)
            if dbpedia_uri != "":
                break
        return dbpedia_uri

    def _link_db_query(self, target, diffbot_query_response, use_overlap=True):
        """Turn a parsed Diffbot response into a descending-sorted candidate list.

        Args:
            target: the phrase text the query was issued for.
            diffbot_query_response: parsed JSON response (a dict); hits are
                read from its "data" key.
            use_overlap: see the class docstring.

        Returns:
            ``Candidate`` objects sorted in descending order; an empty list if
            the response has no "data" key. Hits without "importance" are
            skipped with a warning.
        """
        candidates = []
        if "data" not in diffbot_query_response:
            return candidates

        for hit in diffbot_query_response["data"]:
            uris = set(hit["allUris"])
            if "origin" in hit:
                uris.add(hit["origin"])
            if "origins" in hit:
                # set.union() returns a new set and leaves `uris` untouched;
                # update() is the in-place form that actually adds the origins.
                uris.update(hit["origins"])
            if "wikipediaUri" in hit:
                uris.add(hit["wikipediaUri"])

            if "importance" not in hit:
                print("Warning: Skipping a hit without importance value.")
                continue

            name = hit["name"]
            importance = float(hit["importance"])
            if use_overlap:
                score = truncated_log(importance) * overlap(name, target)
            else:
                score = importance
            wiki_uri = self._find_wiki_uri(uris)
            dbpedia_uri = self._get_dbpedia_uri(wiki_uri, uris)

            candidates.append(Candidate(score,
                                        name,
                                        dbpedia_uri,
                                        wiki_uri,
                                        hit["types"],
                                        hit["allNames"],
                                        uris))

        # Debugging hook kept for the inspection cells further down the notebook.
        global cd
        cd = candidates
        return sorted(candidates, reverse=True)

    def link(self, context, phrases):
        """Link each phrase to its best-ranked candidate.

        Args:
            context: the surrounding text; currently not used by the ranking.
            phrases: iterable of objects exposing a ``text`` attribute.

        Returns:
            A list of ``(phrase, candidate)`` tuples. A phrase for which no
            candidate was retrieved is skipped (with a warning) instead of
            raising ``IndexError``, so the result can be shorter than the input.
        """
        linked_phrases = []
        for phrase in phrases:
            candidates = []
            for entity_type in EL_POL_ENTITY_TYPES:
                r = self._cq.make_query('type:{} name:"{}"'.format(entity_type, phrase.text))
                db_response = json.loads(r.content)
                candidates += self._link_db_query(phrase.text, db_response, use_overlap=self._use_overlap)

            # De-duplicate across entity types; presumably relies on Candidate
            # defining __hash__/__eq__ -- verify in candidate.py.
            candidates = set(candidates)
            if not candidates:
                print("Warning: no candidates found for phrase '{}'.".format(phrase.text))
                continue

            linked_phrases.append( (phrase, sorted(candidates, reverse=True)[0]) )

        if len(linked_phrases) != len(phrases):
            print("Warning: length of output is not equal to length of input {} != {}".format(
                len(linked_phrases), len(phrases)))

        return linked_phrases

    def link_ttl(self, input_ttl):
        """Parse a turtle document, link its phrases and serialize the graph as N3.

        NOTE(review): ``parse_d2kb_ttl``, ``CLASS_URI``, ``LINK_URI`` and
        ``NONE_URI`` are not imported in the visible cells -- confirm they are
        in scope before calling this method.
        """
        graph, context, phrases = parse_d2kb_ttl(input_ttl)

        print("# triples input:", len(graph))
        # Unpack the tuple so `phrase` is the loop's own phrase rather than a
        # global that happens to be left over from another cell.
        for phrase, _candidate in self.link(context, phrases):
            # TODO(review): the selected candidate is not written to the graph
            # yet; both triples still point at NONE_URI as in the original.
            graph.add( (phrase.subj, CLASS_URI, NONE_URI) )
            graph.add( (phrase.subj, LINK_URI, NONE_URI) )

        print("# triples output:", len(graph))
        output_ttl = str(graph.serialize(format='n3', encoding="utf-8"), "utf-8")
        return output_ttl

In [5]:
from candidate import Phrase

# Sentence the mentions below are taken from.
context = "Inside, it’s even wackier: curved walls, windows in the shape of teardrops, and a catwalk with a tiny video screen embedded in the floor that shows an endless loop of antlike commuters rushing through Grand Central Terminal in New York."

# Comma-separated mentions; each one is stripped and wrapped in a Phrase whose
# positional arguments are (text, 1, len(text), "http://" + text).
phrases = "New York, windows, catwalk, teardrops, commuters, curved, floor, shape, video, walls, Grand Central Terminal"
phrases = [
    Phrase(text, 1, len(text), "http://" + text)
    for text in map(str.strip, phrases.split(","))
]

bl = BaselineLinker()

# Print every mention next to the link of its top-ranked candidate.
for phrase, candidate in bl.link(context, phrases):
    print(phrase.text, candidate.link)

New York 
windows 
catwalk 
teardrops 
commuters en.wikipedia.org/wiki/The_Commuters
curved en.wikipedia.org/wiki/Curved_Air
floor 
shape 
video en.wikipedia.org/wiki/Video_Ezy
walls en.wikipedia.org/wiki/Walls_(band)
Grand Central Terminal 


In [None]:
# Display the `candidate` left bound by the final iteration of the linking
# loop in the demo cell; this cell only works after that cell has run.
candidate

In [None]:
# Print the hash of every candidate captured by the most recent
# `_link_db_query` call, in sorted order. The result of `sorted(cd)` used to be
# discarded, so the loop walked the unsorted list.
for c in sorted(cd):
    print(".", hash(c))
    

In [None]:
# Enable IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel.
# NOTE(review): this cell sits after the cells that import the project modules;
# consider moving it to the top of the notebook.
%load_ext autoreload
%autoreload 2

# DQL (Diffbot Query Language) examples

In [None]:
# Look up the same entity through different URI fields and save each response.
query_and_save('allUris:"barackobama.com"', "data/all-uris.json")
query_and_save('wikipediaUri:"en.wikipedia.org/wiki/Barack_Obama"', "data/wiki-uri.json")
# Raw string: `\_` is not a valid escape sequence in a normal literal. The query
# text sent is unchanged (it still contains the backslash).
query_and_save(r'allUris:"en.wikipedia.org/wiki/Barack\_Obama"', "data/all-uris-wiki.json")
query_and_save('origins:"en.wikipedia.org/wiki/Barack_Obama"', "data/origins.json")

In [None]:
# One unfiltered query per entity type, saved under the type's name.
for entity_type in entity_types:
    type_query = 'type:{}'.format(entity_type)
    type_fpath = "data/{}.json".format(entity_type)
    query_and_save(query=type_query, output_fpath=type_fpath)

# Person queries: a lookup by name, then three filters on employment records.
person_queries = [
    ('type:Person name:"Alexander Panchenko"',
     "data/ap.json"),
    ('type:Person employments.employer.name:"Diffbot"',
     "data/diffbot-employees.json"),
    ('type:Person employments.{title:"CEO" employer.name:"Diffbot"}',
     "data/diffbot-ceo.json"),
    ('type:Person employments.{employer.name:"Diffbot" isCurrent:true}',
     "data/diffbot-current-employees.json"),
]
for person_query, person_fpath in person_queries:
    query_and_save(query=person_query, output_fpath=person_fpath)


# Testing type of links

In [None]:
# Save the response of a name lookup for each of four well-known people.
name_queries = [
    ('type:Person name:"Angela Merkel"', "data/am.json"),
    ('type:Person name:"Barack Obama"', "data/bo.json"),
    ('type:Person name:"Nicolas Sarkozy"', "data/ns.json"),
    ('type:Person name:"Diego Maradona"', "data/dm.json"),
]
for name_query, name_fpath in name_queries:
    query_and_save(query=name_query, output_fpath=name_fpath)