In [1]:
import requests
import csv
import pandas as pd
import numpy as np
!pip install pyspotlight
import spotlight



In [2]:
def search_wikidata(entity, timeout=30):
    """Return the top Wikidata ``wbsearchentities`` hit for ``entity``.

    Parameters
    ----------
    entity : str
        Free-text name to search for (the request uses ``language=en``).
    timeout : float, optional
        Seconds to wait for the HTTP request before ``requests`` gives up.

    Returns
    -------
    tuple of (str, str)
        ``(concepturi, label)`` of the first search result, or ``('', '')``
        when the response carries no usable first result.

    Notes
    -----
    Connection-level errors raised by ``requests`` (including a timeout) are
    not caught here and propagate to the caller.
    """
    # Hand the query to requests as a dict so that characters such as '&',
    # '#', '+' or '%' inside the entity name are percent-encoded instead of
    # being interpreted as URL syntax.
    params = {
        "action": "wbsearchentities",
        "search": entity,
        "language": "en",
        "format": "json",
    }
    response = requests.get(
        "https://www.wikidata.org/w/api.php", params=params, timeout=timeout
    )
    try:
        res = response.json()['search'][0]
        return res['concepturi'], res['label']
    except (ValueError, KeyError, IndexError, TypeError):
        # Body is not JSON, has no 'search' list, the list is empty, or the
        # first hit lacks one of the expected fields.
        return '', ''

In [3]:
def search_dbpedia(entity, timeout=30):
    """Return the top DBpedia Lookup ``PrefixSearch`` hit for ``entity``.

    Parameters
    ----------
    entity : str
        Text sent as the ``QueryString`` parameter.
    timeout : float, optional
        Seconds to wait for the HTTP request before ``requests`` gives up.

    Returns
    -------
    tuple of (str, str)
        ``(uri, label)`` of the first entry in the response's ``results``
        list, or ``('', '')`` when no usable first entry is present.

    Notes
    -----
    Connection-level errors raised by ``requests`` (including a timeout) are
    not caught here and propagate to the caller.
    """
    url = "http://lookup.dbpedia.org/api/search/PrefixSearch"
    # Up to five hits are requested, but only the first one is returned.
    querystring = {"MaxHits": "5", "QueryString": entity}
    headers = {
        'Accept': "application/json",
        'cache-control': "no-cache",
    }
    response = requests.get(url, headers=headers, params=querystring, timeout=timeout)
    try:
        res = response.json()['results'][0]
        return res['uri'], res['label']
    except (ValueError, KeyError, IndexError, TypeError):
        # Body is not JSON, has no 'results' list, the list is empty, or the
        # first hit lacks one of the expected fields.
        return '', ''

In [4]:
def dbpedia_spotlight(organization, confidence=0.4):
    """Annotate ``organization`` with DBpedia Spotlight and return the first URI.

    Parameters
    ----------
    organization : str
        Text passed to ``spotlight.annotate``.
    confidence : float, optional
        Confidence threshold forwarded to ``spotlight.annotate``.

    Returns
    -------
    str
        The ``'URI'`` field of the first annotation, or ``''`` when the call
        or the lookup of the first annotation raises an ``Exception``.
    """
    try:
        annotations = spotlight.annotate(
            "https://api.dbpedia-spotlight.org/en/annotate",
            organization,
            confidence=confidence,
        )
        return annotations[0]['URI']
    except Exception:
        # Best-effort lookup: any service or parsing failure means "no match".
        # Catching Exception (not a bare except) keeps KeyboardInterrupt and
        # SystemExit working while this runs inside a long loop.
        return ''

In [5]:
# Load the reference table. Later cells read its 'Organization',
# 'Wikidata URI (latest)' and 'DBpedai URI (latest)' columns and use the two
# URI columns as the true labels when scoring each linker.
df = pd.read_csv('./standard.csv')
# Preview the first rows.
df.head()

Unnamed: 0,Portal Identifier,Snapshot Identifier,Dataset Identifier,Organization,Wikidata URI (latest),DBpedai URI (latest)
0,data_gov\n,1847,9602aa07-3eff-438b-9904-151cfd114685,California Natural Resource Agency,https://www.wikidata.org/entity/Q5020712,
1,www_daten_rlp_de,1641,95548f0f-890f-4f28-a841-b427b84256a0,Open-Government-Data-Portal Rheinland-Pfalz,https://www.wikidata.org/entity/Q63428138,
2,govdata_de,1638,9552e0f2-422c-4c1b-8184-07c5606a75e8,Transparenzportal Hamburg,https://www.wikidata.org/entity/Q59273672,
3,data_gov_au,1624,955ac416-cf9f-4f87-a46e-056f704f1567,Australian Institute of Marine Science,https://www.wikidata.org/entity/Q4824311,http://dbpedia.org/resource/Australian_Institu...
4,daten_rlp_de,1624,955aee5c-44ad-4b1c-a5d7-aceca7002fd7,Statistisches Landesamt Rheinland-Pfalz,https://www.wikidata.org/entity/Q2333954,http://dbpedia.org/resource/Statistisches_Land...


In [6]:
# Look up every organization in both search services and keep only the URI of
# the top hit ('' when a service returned nothing usable).
# The results are keyed by organization name, so each distinct name is queried
# once instead of once per row; repeated names would only overwrite their own entry.
wiki = {}
pedia = {}
for org in df['Organization'].unique():
    wiki[org] = search_wikidata(org)[0]
    pedia[org] = search_dbpedia(org)[0]

In [7]:
# Build the true/predicted label lists, one entry per row of df and in row order.
# Wikidata URIs (true and predicted) are cut down to the part starting at their
# last '/', so only that trailing segment is compared; an empty prediction
# stays ''. DBpedia URIs are compared as full strings.
y_true_wiki = [uri[uri.rfind('/'):] for uri in df['Wikidata URI (latest)']]
y_pred_wiki = [wiki[org][wiki[org].rfind('/'):] for org in df['Organization']]
y_true_pedia = list(df['DBpedai URI (latest)'])
y_pred_pedia = [pedia[org] for org in df['Organization']]

In [8]:
from sklearn.metrics import precision_recall_fscore_support

In [9]:
# Macro-averaged (precision, recall, F-score, support) of the Wikidata search
# predictions against the reference Wikidata labels.
precision_recall_fscore_support(y_true_wiki, y_pred_wiki, average='macro')

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


(0.6216216216216216, 0.5842342342342343, 0.5963963963963964, None)

In [10]:
# Macro-averaged (precision, recall, F-score, support) of the DBpedia Lookup
# predictions against the reference DBpedia URIs.
precision_recall_fscore_support(y_true_pedia, y_pred_pedia, average='macro')

(0.3904109589041096, 0.3732876712328767, 0.3787671232876712, None)

In [11]:
# DBpedia Spotlight prediction for every row's organization, in row order
# ('' where the annotator produced nothing).
y_pred = [dbpedia_spotlight(name) for name in df['Organization']]

In [12]:
# Macro-averaged (precision, recall, F-score, support) of the DBpedia Spotlight
# predictions against the reference DBpedia URIs.
precision_recall_fscore_support(y_true_pedia, y_pred, average='macro')

(0.21453900709219859, 0.22340425531914893, 0.21719858156028365, None)

In [20]:
def opentapioca(text, timeout=30):
    """Annotate ``text`` with the OpenTapioca API and return the first tag id.

    Parameters
    ----------
    text : str
        Text sent as the ``query`` form field.
    timeout : float, optional
        Seconds to wait for the HTTP request before ``requests`` gives up.

    Returns
    -------
    str
        ``annotations[0]['tags'][0]['id']`` from the JSON response, or ``''``
        when the response does not contain that path.

    Notes
    -----
    Connection-level errors raised by ``requests`` (including a timeout) are
    not caught here and propagate to the caller.
    """
    url = "https://opentapioca.org/api/annotate"
    headers = {
        'content-type': "application/x-www-form-urlencoded",
        'accept': "application/json",
        # NOTE(review): credential hard-coded in the notebook. Confirm whether
        # the endpoint needs it at all; if so, load it from the environment
        # and rotate this value.
        'authorization': "Bearer: c3d567607acd70d022621918da649ed2b8b173dd"
        }

    # Pass the form field as a dict so requests form-encodes every reserved
    # character ('&', '=', '+', '%', non-ASCII, ...), not just spaces.
    response = requests.post(url, data={"query": text}, headers=headers, timeout=timeout)
    try:
        return response.json()["annotations"][0]['tags'][0]['id']
    except (ValueError, KeyError, IndexError, TypeError):
        # Body is not JSON, or it has no annotations / no tags for the first
        # annotation.
        return ''

In [26]:
# OpenTapioca prediction for every row's organization, in row order. Each id is
# prefixed with '/' so it has the same shape as the sliced Wikidata labels in
# y_true_wiki; a miss becomes '/'.
y_pred2 = [f'/{pred}' for pred in map(opentapioca, df['Organization'])]

In [27]:
precision_recall_fscore_support(y_true_wiki, y_pred2, average='macro')

(0.4329004329004329, 0.420995670995671, 0.42308802308802307, None)