# Query complementary information

In [1]:
import json
import os
import pandas as pd
from _datetime import date
from datetime import datetime
import time
from operator import itemgetter
from pandas import json_normalize 
import numpy as np
import sys

# wikidata api 
from SPARQLWrapper import SPARQLWrapper, JSON
# from qwikidata.sparql import return_sparql_query_results

# For DBpedia spotlight, PPE entities
import requests
import pycurl
from urllib.request import urlopen
from urllib.parse import quote

#### Process DBpedia Spotlight entity annotation: functions

In [2]:
# DBpedia Spotlight is used to search for entities in text.
# URL of the annotation service on the local installation of DBpedia Spotlight
urlAnnotation = 'http://dbpedia-spotlight.en:80/rest/annotate/'
# urlAnnotation = 'http://dbpedia-spotlight.en:2222/rest/annotate/' ## not working

# Alternative: the KMi-hosted annotation endpoint (uncomment to use it instead)
# urlAnnotation = 'https://whise.kmi.open.ac.uk/rest/annotate'
# setting headers and parameters not using DBpedia categories
def setDbPediaAnnotationServiceParameters(text):
    """Build the headers and parameters of a DBpedia Spotlight annotation request.

    No entity-type filter is set; the confidence value is fixed at '0.4'.

    args: text - text to be analysed
    return: headers - requests a JSON answer and declares a form-urlencoded content type
            params - the confidence value and the text
    """
    request_headers = {'Accept': 'application/json'}
    request_headers["content-type"] = "application/x-www-form-urlencoded"

    # Keyword construction keeps the original key order: confidence, then text.
    request_params = dict(confidence='0.4', text=text)
    return request_headers, request_params

# setting headers and parameters, filtering by categories
def setDbPediaAnnotationServiceParametersTypes(text,types):
    """Build the headers and parameters of a type-filtered DBpedia Spotlight request.

    The confidence value is fixed at '0.35'.

    args: text - text to be annotated
          types - categories of entities to pass in the 'types' parameter
    return: headers - requests a JSON answer and declares a form-urlencoded content type
            params - the types filter, the confidence value and the text
    """
    request_headers = {'Accept': 'application/json'}
    request_headers["content-type"] = "application/x-www-form-urlencoded"

    # Keyword construction keeps the original key order: types, confidence, text.
    request_params = dict(types=types, confidence='0.35', text=text)
    return request_headers, request_params

# return response in JSON format
def queryDBPediaAnnotation(url,header,params):
    """Send a text to the DBpedia Spotlight annotation service and return the decoded JSON.

    args: url - URL of the annotation service
          header - request headers (Accept / content-type)
          params - request fields (text, confidence and optionally types)
    return: the JSON response decoded into Python objects
    raises: Exception - wrapping any connection, HTTP status or JSON decoding error;
            the original error is printed and kept as the exception's cause
    """
    try:
        # The headers declare a form-urlencoded body, so the fields are sent in the
        # request body (data=) rather than in the query string: a whole sentence or
        # paragraph placed in the URL can exceed the server's URL length limit.
        reply = requests.post(url, headers=header, data=params)
        # Report HTTP errors (4xx/5xx) as such instead of as an obscure JSON decoding error.
        reply.raise_for_status()
        return reply.json()
    except Exception as ex:
        print(ex)
        raise Exception(ex) from ex

def executeQueryDbpedia(q, f='application/json'):
    """Run a SPARQL query against the public DBpedia endpoint.

    args: q - SPARQL query text
          f - value of the Accept header, i.e. the requested result format
    return: the response object of the GET request (not decoded)
    raises: whatever the request raises; the error is printed to stdout first
    """
    endpoint = "http://dbpedia.org/sparql"
    try:
        return requests.get(endpoint, params={'query': q}, headers={'Accept': f})
    except Exception as err:
        print(err, file=sys.stdout)
        raise

def queryGeoLocationInformation(uri):
    """Query DBpedia for the coordinates, English label and thumbnail of a resource.

    args: uri - URI of the DBpedia resource, e.g. http://dbpedia.org/resource/London
    return: variables - variable names listed in the response head
                        (['s', 'lat', 'long', 'label', 'image'] in the sample below)
            values - value of each variable in the first result binding,
                     np.nan for a variable that is not bound
            Both lists are empty when the query returns no binding.
    """
    values = []
    variables = []
    # The language tag must reach the endpoint as the plain literal "EN". It used to be
    # written as a Python concatenation *inside* the triple-quoted string, so the
    # characters "+'"EN"'+" were sent verbatim and the SPARQL was not valid.
    query_text = (
        'SELECT * WHERE { VALUES ?s { <' + uri + '> } '
        'OPTIONAL { ?s <http://www.w3.org/2003/01/geo/wgs84_pos#lat> ?lat } . '
        'OPTIONAL { ?s <http://www.w3.org/2003/01/geo/wgs84_pos#long> ?long } . '
        'OPTIONAL { ?s rdfs:label ?label .  FILTER langMatches( lang(?label), "EN" )  } . '
        'OPTIONAL { ?s dbo:thumbnail ?image } . }'
    )
    # Sample response:
    # {'head': {'link': [], 'vars': ['s', 'lat', 'long', 'label', 'image']}, 'results':
    #  {'distinct': False, 'ordered': True, 'bindings':
    #   [{'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/London'},
    #     'lat': {'type': 'typed-literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#float', 'value': '51.5072'},
    #     'long': {'type': 'typed-literal', 'datatype': 'http://www.w3.org/2001/XMLSchema#float', 'value': '-0.1275'},
    #     'label': {'type': 'literal', 'xml:lang': 'en', 'value': 'London'},
    #     'image': {'type': 'uri', 'value': 'http://commons.wikimedia.org/wiki/Special:FilePath/London_Montage_L.jpg?width=300'}}]}}
    results = executeQueryDbpedia(query_text).json()
    bindings = results['results']['bindings']
    if len(bindings) == 0:
        print("No entity", uri)
    else:
        # Only the first binding is used.
        first_binding = bindings[0]
        variables = list(results['head']['vars'])
        for label in variables:
            if label in first_binding:
                values.append(first_binding[label]['value'])
            else:
                # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
                values.append(np.nan)
    return variables,values

def queryBiographyInformation(uri):
    """Query DBpedia for the biographical details of a resource.

    The query asks for the English label, thumbnail, birth date, death date,
    Wikidata identifier (owl:sameAs) and English abstract of the resource.

    args: uri - URI of the DBpedia resource, e.g. http://dbpedia.org/resource/Louis_Jordan
    return: variables - variable names listed in the response head
            values - value of each variable in the first result binding; np.nan for a
                     variable that is not bound, and for a birthdate/deathdate that
                     evaluateDateFormat rejects
            Both lists are empty when the query returns no binding.
    """
    values = []
    variables = []
    # SELECT * WHERE {
    # VALUES ?s { <http://dbpedia.org/resource/Louis_Jordan> }
    # OPTIONAL { ?s rdfs:label ?label .  FILTER langMatches( lang(?label), "EN" )  } .
    #  OPTIONAL { ?s dbo:thumbnail ?image } .
    # OPTIONAL { ?s dbp:birthDate ?birthdate } .
    # OPTIONAL { ?s dbp:deathDate ?deathdate } .
    # OPTIONAL { ?s owl:sameAs ?wikidataId . FILTER regex (?wikidataId, "http://www.wikidata.org/entity/" ) }
    # OPTIONAL { ?s dbo:abstract ?abstract .  FILTER langMatches( lang(?abstract), "EN" ) } .
    # }
    query_text = """SELECT * WHERE { VALUES ?s { <"""+uri+"""> } OPTIONAL { ?s rdfs:label ?label .  FILTER langMatches( lang(?label), "EN" )  } . 
    OPTIONAL { ?s dbo:thumbnail ?image } . OPTIONAL { ?s dbp:birthDate ?birthdate } . OPTIONAL { ?s dbp:deathDate ?deathdate } . 
    OPTIONAL { ?s owl:sameAs ?wikidataId . FILTER regex (?wikidataId, "http://www.wikidata.org/entity/" ) }
    OPTIONAL { ?s dbo:abstract ?abstract .  FILTER langMatches( lang(?abstract), "EN" ) } . }"""
    results = executeQueryDbpedia(query_text).json()
    bindings = results['results']['bindings']
    if len(bindings) == 0:
        print("No entity", uri)
    else:
        # Only the first binding is used.
        # NOTE(review): a resource with several birth/death date values yields several
        # bindings; a valid date in a later binding is ignored - confirm this is intended.
        first_binding = bindings[0]
        variables = list(results['head']['vars'])
        for label in variables:
            if label not in first_binding:
                # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
                values.append(np.nan)
                continue
            value = first_binding[label]['value']
            if label in ('birthdate', 'deathdate') and not evaluateDateFormat(value):
                # Dates that fail the format check are treated as missing.
                value = np.nan
            values.append(value)
    return variables,values
   
def setWikidateParams(query):
    """Run a SPARQL query on the Wikidata query service and return the converted result.

    args: query - SPARQL query text
    return: the result of the query converted from the JSON return format
    """
    # TODO adjust user agent; see https://w.wiki/CX6
    python_version = (sys.version_info[0], sys.version_info[1])
    agent_name = "Polifonia-MEETUPSPilot0.1 (https://polifonia-project.eu/pilots/meetups/; alba.morales-tirado@open.ac.uk) /%s.%s" % python_version

    client = SPARQLWrapper("https://query.wikidata.org/sparql", agent=agent_name)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()

def queryWikidataEndpoint(uri):
    """Query Wikidata for the birth date (P569) and death date (P570) of an entity.

    args: uri - URI of the Wikidata entity, e.g. http://www.wikidata.org/entity/Q212692
    return: variables - variable names listed in the response head
            values - value of each variable in the first result binding, cut at the
                     first "T" so only the date part of a timestamp is kept;
                     np.nan for a variable that is not bound
            Both lists are empty when the query returns no binding. The birth date
            is not optional in the query, so an entity without one returns no binding.
    """
    values = []
    variables = []
    # """SELECT * WHERE { <http://www.wikidata.org/entity/Q212692> wdt:P569 ?birthdate ; 
    # OPTIONAL { <http://www.wikidata.org/entity/Q212692> wdt:P570 ?deathdate } . }"""
    sparql_query = """SELECT * WHERE { <"""+uri+"""> wdt:P569 ?birthdate ; 
    OPTIONAL { <"""+uri+"""> wdt:P570 ?deathdate } . }"""
    results = setWikidateParams(sparql_query)

    bindings = results['results']['bindings']
    if len(bindings) == 0:
        print("No entity", uri)
    else:
        # Only the first binding is used.
        first_binding = bindings[0]
        variables = list(results['head']['vars'])
        for label in variables:
            if label in first_binding:
                # e.g. "1931-01-01T00:00:00Z" -> "1931-01-01"
                values.append(first_binding[label]['value'].split("T",1)[0])
            else:
                # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
                values.append(np.nan)
    return variables,values

def evaluateDateFormat(dateValue):
    """Tell whether a value is a date written in the "%Y-%m-%d" format.

    args: dateValue - value to check, e.g. "1829-11-28"
    return: True when dateValue parses with the "%Y-%m-%d" format; False otherwise,
            including for values that are not strings (None, NaN, ...)
    """
    expected_format = "%Y-%m-%d"
    try:
        datetime.strptime(dateValue, expected_format)
    except (ValueError, TypeError):
        # ValueError: text that does not match the format, e.g. "--11-28"
        # TypeError: strptime only accepts strings, so None/NaN end up here
        return False
    return True

In [3]:
# Manual check of the DBpedia lookup: pair each returned variable name with its value
labels,values = queryBiographyInformation("http://dbpedia.org/resource/Anton_Rubinstein")
# labels,values = queryBiographyInformation("http://dbpedia.org/resource/Edward_Elgar")

dictionary = dict(zip(labels,values))
print(dictionary)
# Manual check of the Wikidata lookup (birth and death dates of the entity)
# NOTE(review): Q5703377 is not the wikidataId printed for Anton Rubinstein (Q87567) - confirm it is the intended entity
lab, val = queryWikidataEndpoint("http://www.wikidata.org/entity/Q5703377")
dic = dict(zip(lab, val))
print(dic)

{'s': 'http://dbpedia.org/resource/Anton_Rubinstein', 'label': 'Anton Rubinstein', 'image': 'http://commons.wikimedia.org/wiki/Special:FilePath/rubinstein_repin.jpg?width=300', 'birthdate': nan, 'deathdate': nan, 'wikidataId': 'http://www.wikidata.org/entity/Q87567', 'abstract': "Anton Grigoryevich Rubinstein (Russian: Антон Григорьевич Рубинштейн, tr. Anton Grigor'evič Rubinštejn; 28 November [O.S. 16 November] 1829 – 20 November [O.S. 8 November] 1894) was a Russian pianist, composer and conductor who became a pivotal figure in Russian culture when he founded the Saint Petersburg Conservatory. He was the elder brother of Nikolai Rubinstein, who founded the Moscow Conservatory. As a pianist, Rubinstein ranks among the great 19th-century keyboard virtuosos. He became most famous for his series of historical recitals—seven enormous, consecutive concerts covering the history of piano music. Rubinstein played this series throughout Russia and Eastern Europe and in the United States when h

In [68]:
# Date part of the Wikidata birth date (anything from the first "T" on is dropped)
print(dic['birthdate'].split("T",1)[0])

# Check a partial date against the expected YYYY-MM-DD format.
# `test_str` and `res` are defined in this cell so it also runs on a fresh kernel;
# `res` used to come from a cell that no longer exists.
test_str = "--11-28"
print("The original string is : " + str(test_str))
res = evaluateDateFormat(test_str)

# printing result
print("Does date match format? : " + str(res))

1931-01-01
The original string is : --11-28
Does date match format? : False


### 1. Information about places, geo-coordinates from DBpedia

In [97]:
# use chunk to load a small number of files in memory
for chunk in pd.read_csv('list_wikiIdSample.csv', chunksize=10):
    # for each file/biography in the list, read its sentences
    # and query DBpedia for the places mentioned in them
    for biography_file in chunk['file_name']:
        print(biography_file)
        if os.path.isfile('geoLocationInformation/'+biography_file):
            print("Already exists. Next.")
            continue
        if not os.path.isfile('meetupsAnnotations/'+biography_file):
            print("No meetups",biography_file)
            continue

        # Read file with segmented sentences
        biography_df = pd.read_csv('meetupsAnnotations/'+biography_file)

        # Read people and places entities, plus the coreference results when they exist.
        # These files do not depend on the sentence, so they are read once per
        # biography instead of once per sentence.
        meetupCandidatePP_df = pd.read_csv('extractedEntitiesPersonPlaceOnly/'+biography_file.replace(".txt",".csv"))
        if os.path.isfile('meetupsCorefOutputPP/'+biography_file):
            tempCoref_df = pd.read_csv('meetupsCorefOutputPP/'+biography_file)
            candidates_df = pd.concat([meetupCandidatePP_df, tempCoref_df], sort=False)
        else:
            print("No coref doc",biography_file)
            candidates_df = meetupCandidatePP_df.copy()

        # Rows are collected in a list and turned into a DataFrame once:
        # DataFrame.append was removed in pandas 2.0.
        records = []
        # The result keeps one row per resource, so each URI is queried only once.
        queried_uris = set()
        # for each sentence in the biography
        for sentence_row in biography_df.itertuples():
            temp_candidate = candidates_df.query("paragraphIndex=={} & sentenceIndex=={} & entType == 'place'".format(sentence_row.paragraphIndex,
                                                                                                         sentence_row.sentenceIndex))
            if temp_candidate.empty:
                print("No place entities.",biography_file)
                continue
            for place in temp_candidate.itertuples():
                # skip missing URIs (NaN) and URIs that were already queried
                if not isinstance(place.URI, str) or place.URI in queried_uris:
                    continue
                queried_uris.add(place.URI)
                # labels are the query variables, values the first result for the place
                labels,values = queryGeoLocationInformation(place.URI)
                if len(values) > 0:
                    records.append(dict(zip(labels, values)))

        entities_df = pd.DataFrame(records)
        if not entities_df.empty:
            entities_df = entities_df.drop_duplicates(subset=['s'])
            entities_df.to_csv('geoLocationInformation/'+biography_file,index=False)
        else:
            print("No coordinates retrieved for biography. ",biography_file)

    # pause between chunks before sending more queries
    print("waiting...")
    time.sleep(20)

10085.csv
Already exists. Next.
9039.csv
Already exists. Next.
21511.csv
Already exists. Next.
45181.csv
Already exists. Next.
49644.csv
Already exists. Next.
50350.csv
Already exists. Next.
57520.csv
Already exists. Next.
99636.csv
Already exists. Next.
180714.csv
Already exists. Next.
312443.csv
Already exists. Next.
waiting...


KeyboardInterrupt: 

### 2. Biography subject information

In [4]:
# use chunk to load a small number of files in memory
for chunk in pd.read_csv('list_wikiIdSample.csv', chunksize=50):
    # for each file/biography in the list, query DBpedia (and Wikidata when the
    # birth date is missing) for every person mentioned in it
    for biography_file in chunk['file_name']:
        print(biography_file)
        if os.path.isfile('peopleComplementaryInformation/'+biography_file):
            print("Already exists. Next.")
            continue

        # read people and places entities
        meetupCandidatePP_df = pd.read_csv('extractedEntitiesPersonPlaceOnly/'+biography_file.replace(".txt",".csv"))
        # add the coreference results when the coreference doc exists
        if os.path.isfile('meetupsCorefOutputPP/'+biography_file):
            tempCoref_df = pd.read_csv('meetupsCorefOutputPP/'+biography_file)
            temp_candidate = pd.concat([meetupCandidatePP_df, tempCoref_df], sort=False)
        else:
            print("No coref doc",biography_file)
            temp_candidate = meetupCandidatePP_df.copy()
        # one row per person URI; no inplace change on the result of query()
        person_candidates = temp_candidate.query("entType == 'person'").drop_duplicates(subset=['URI'])

        # Rows are collected in a list and turned into a DataFrame once:
        # DataFrame.append was removed in pandas 2.0.
        records = []
        if person_candidates.empty:
            print("No person entities.",biography_file)
        else:
            print("Query entities",len(person_candidates))
            queried = 0
            for person in person_candidates.itertuples():
                # skip missing URIs (NaN): they cannot be placed in a query
                if not isinstance(person.URI, str):
                    continue
                # labels are the query variables, values the first result for the person
                labels,values = queryBiographyInformation(person.URI)
                queried += 1
                if len(values) > 0:
                    row_dict = dict(zip(labels, values))
                    # if the birth date is empty, take the dates from wikidata
                    if pd.isna(row_dict['birthdate']) and not pd.isna(row_dict['wikidataId']):
                        lab, val = queryWikidataEndpoint(row_dict['wikidataId'])
                        if len(val) > 0:
                            row_dates = dict(zip(lab, val))
                            row_dict['birthdate'] = row_dates['birthdate']
                            row_dict['deathdate'] = row_dates['deathdate']
                    records.append(row_dict)
                # Pause after every 50 queries sent. Counting queries (not stored rows)
                # avoids pausing while the counter is still 0, or again and again
                # while it is stuck on a multiple of 50 because lookups returned nothing.
                if queried % 50 == 0:
                    print("waiting entity...")
                    time.sleep(10)

        entities_df = pd.DataFrame(records)
        if not entities_df.empty:
            entities_df.to_csv('peopleComplementaryInformation/'+biography_file,index=False)
        else:
            print("No info retrieved for biography. ",biography_file)

    # pause between chunks before sending more queries
    print("waiting biography...")
    time.sleep(20)

10085.csv
Already exists. Next.
9039.csv
Already exists. Next.
21511.csv
Already exists. Next.
45181.csv
Already exists. Next.
49644.csv
Already exists. Next.
50350.csv
Already exists. Next.
57520.csv
Already exists. Next.
99636.csv
Already exists. Next.
180714.csv
Already exists. Next.
312443.csv
Already exists. Next.
2444917.csv
Already exists. Next.
608845.csv
Already exists. Next.
1048151.csv
Already exists. Next.
2232977.csv
Already exists. Next.
1913885.csv
Already exists. Next.
1790990.csv
Already exists. Next.
409969.csv
Already exists. Next.
1709886.csv
Already exists. Next.
1551347.csv
Already exists. Next.
50782750.csv
Already exists. Next.
181946.csv
Already exists. Next.
579599.csv
Already exists. Next.
50963136.csv
Already exists. Next.
226142.csv
Already exists. Next.
1422240.csv
Already exists. Next.
1174545.csv
Already exists. Next.
70020.csv
Already exists. Next.
752694.csv
Already exists. Next.
312781.csv
Already exists. Next.
2898019.csv
Already exists. Next.
225302

In [47]:
# Inspect the entities table currently in memory: column summary, then the first rows.
# NOTE(review): entities_df is rebuilt for each biography by the retrieval loops, so this
# presumably shows the last biography that was processed - confirm before relying on it.
entities_df.info()
entities_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   s           70 non-null     object
 1   label       70 non-null     object
 2   image       62 non-null     object
 3   birthdate   69 non-null     object
 4   deathdate   59 non-null     object
 5   wikidataId  69 non-null     object
 6   abstract    69 non-null     object
dtypes: object(7)
memory usage: 4.0+ KB


Unnamed: 0,s,label,image,birthdate,deathdate,wikidataId,abstract
0,http://dbpedia.org/resource/Saint_George,Saint George,http://commons.wikimedia.org/wiki/Special:File...,0275-01-01T00:00:00Z,0303-04-24T00:00:00Z,http://www.wikidata.org/entity/Q48438,"Saint George (Greek: Γεώργιος (Geórgios), Lati..."
1,http://dbpedia.org/resource/Johann_Nepomuk_Hummel,Johann Nepomuk Hummel,http://commons.wikimedia.org/wiki/Special:File...,1778-11-14,1837-10-17,http://www.wikidata.org/entity/Q151953,Johann Nepomuk Hummel (14 November 1778 – 17 O...
2,http://dbpedia.org/resource/Ludwig_van_Beethoven,Ludwig van Beethoven,http://commons.wikimedia.org/wiki/Special:File...,1770-12-16T00:00:00Z,1827-03-26T00:00:00Z,http://www.wikidata.org/entity/Q255,Ludwig van Beethoven (baptised 17 December 177...
3,http://dbpedia.org/resource/Madeleine_of_Valois,Madeleine of Valois,http://commons.wikimedia.org/wiki/Special:File...,1520-08-10,1537-07-07,http://www.wikidata.org/entity/Q231074,Madeleine of France or Madeleine of Valois (10...
4,http://dbpedia.org/resource/Robert_Schumann,Robert Schumann,http://commons.wikimedia.org/wiki/Special:File...,1810-06-08,1856-07-29,http://www.wikidata.org/entity/Q7351,Robert Schumann (German: [ˈʁoːbɛʁt ˈʃuːman]; 8...
