# EventKG - Extracting info for one event

This notebook automatically retrieves information about a single event, in particular its ground truth from EventKG.

Before running the notebook, make sure you have the following:
* EventKG downloaded and preprocessed, cf. `eventkg-filtering.ipynb`
* Subset of EventKG loaded in [GraphDB](https://graphdb.ontotext.com)
* GraphDB endpoint active (repository name: `eventkg`)

In [None]:
import io
import os
import json
import requests
import psutil

import pandas as pd
from settings import FOLDER_PATH

In [None]:
# Ask the SPARQL endpoint for CSV so that responses can be parsed with pandas.
HEADERS = {"Accept": "text/csv"}

# Dataset name -> common prefix of that dataset's URIs. The prefix is injected
# into the `strStarts` filter of the SPARQL query templates.
DATASET_TO_START_URI = dict(
    dbpedia="http://dbpedia",
    wikidata="http://www.wikidata",
    yago="http://yago",
)
# CPU count reported by psutil with `logical=False`.
# NOTE(review): not referenced in the cells visible here; confirm it is still needed.
NB_CPUS = psutil.cpu_count(logical=False)

In [None]:
# <TO-DO: change if necessary>
ENDPOINT = "http://localhost:7200/repositories/eventkg"  # GraphDB endpoint

# Alternative configuration (DBpedia example); swap it with the active one below to use it.
# FOLDER_SAVE_DATA = os.path.join(FOLDER_PATH, "data-ind")  # Folder to save the data
# EVENT = "http://dbpedia.org/resource/French_Revolution" # Event to extract info from
# DATASET = "dbpedia"  # Dataset, must be one of the keys of `DATASET_TO_START_URI`

FOLDER_SAVE_DATA = os.path.join(FOLDER_PATH, "data-test")  # Folder to save the data
EVENT = "http://www.wikidata.org/entity/Q47015896" # Event to extract info from
DATASET = "wikidata"  # Dataset, must be one of the keys of `DATASET_TO_START_URI`

# Fail fast on an unknown dataset: otherwise the output folders would be created
# for the wrong name before the query templates fail with a bare KeyError.
if DATASET not in DATASET_TO_START_URI:
    raise ValueError(
        f"Unknown DATASET {DATASET!r}; expected one of {sorted(DATASET_TO_START_URI)}"
    )

In [None]:
# Create the output tree <FOLDER_SAVE_DATA>/<DATASET>/{config,gs_events,referents}.
# `os.makedirs` creates missing parent folders as well, and `exist_ok=True` makes
# the cell safe to re-run without a separate (race-prone) existence check.
for folder in ["config", "gs_events", "referents"]:
    path_folder = os.path.join(FOLDER_SAVE_DATA, DATASET, folder)
    os.makedirs(path_folder, exist_ok=True)

## 1. Retrieving info for the input event

* Ground truth events from EventKG 
* Referents (URI mapping)
* Start/End dates


### 1.1. Ground truth for each event

Ground truth = the events that are part of the input event (its sub-events) in EventKG.

In [None]:
# SPARQL template listing the sub-events of one event.
# Placeholders:
#   * `event-to-replace`     -> URI of the input event; the surrounding `<...>`
#                               is part of the template and stays in the query.
#   * `<dataset-to-replace>` -> URI prefix of the selected dataset (filled below).
# The last two FILTERs keep only sub-events whose time span overlaps the time
# span of the input event.
QUERY_GROUND_TRUTH_TEMPLATE = """
PREFIX sem: <http://semanticweb.cs.vu.nl/2009/11/sem/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT DISTINCT(?subEventKG as ?linkDBpediaEn)
WHERE {
    
?event owl:sameAs <event-to-replace> .
?event sem:hasSubEvent* ?subEvent .
?subEvent owl:sameAs ?subEventKG .
    
?event sem:hasBeginTimeStamp ?startTimeEvent .
?event sem:hasEndTimeStamp ?endTimeEvent .

?subEvent sem:hasBeginTimeStamp ?startTimeSubEvent .
?subEvent sem:hasEndTimeStamp ?endTimeSubEvent .
    
FILTER( strStarts( str(?subEventKG), "<dataset-to-replace>" ) ) .
FILTER (?endTimeSubEvent >= ?startTimeEvent) .
FILTER (?startTimeSubEvent <= ?endTimeEvent) .
}
"""

# The dataset is fixed for the whole notebook, so its prefix is substituted once here.
QUERY_GROUND_TRUTH_TEMPLATE = QUERY_GROUND_TRUTH_TEMPLATE.replace(
    "<dataset-to-replace>",
    DATASET_TO_START_URI[DATASET],
)

In [None]:
# Fill in the event URI. Only the text between the angle brackets is replaced,
# so the URI stays wrapped in `<...>` in the final query.
query = QUERY_GROUND_TRUTH_TEMPLATE.replace(
    "event-to-replace", EVENT
)
response = requests.get(ENDPOINT, headers=HEADERS,
                        params={"query": query})
# Stop on HTTP errors (endpoint down, malformed query, ...) instead of parsing
# the error body as CSV and saving it as the gold standard.
response.raise_for_status()
df_sub_event = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
# Saved as <FOLDER_SAVE_DATA>/<DATASET>/gs_events/<last URI segment>.csv
df_sub_event.to_csv(os.path.join(FOLDER_SAVE_DATA, DATASET, "gs_events", f"{EVENT.split('/')[-1]}.csv"))
df_sub_event.head(3)
        

### 1.2. URI referents for each sub event

Because datasets differ between versions, the URI of a given resource can change over time. The aim of this section is to retrieve a unique referent ID for each set of equivalent URIs.


In [None]:
from src.get_equivalent_url import get_equivalent_url

In [None]:
# First argument: the ground-truth CSV saved in the `gs_events` folder.
# Second argument: a JSON path with the same base name in the `referents` folder.
# NOTE(review): presumably the helper reads the first path and writes the second;
# confirm against `src.get_equivalent_url`.
get_equivalent_url(
    os.path.join(FOLDER_SAVE_DATA, DATASET, "gs_events", EVENT.rsplit("/", 1)[-1] + ".csv"),
    os.path.join(FOLDER_SAVE_DATA, DATASET, "referents", EVENT.rsplit("/", 1)[-1] + ".json"),
)

### 1.3. Start and End dates of each event

Minimum start date among all start dates, maximum end date among all end dates.

Start date must be before end date.

In [None]:
# SPARQL template returning, for one event, the earliest begin timestamp (`min`)
# and the latest end timestamp (`max`); rows where max < min are dropped by HAVING.
# Placeholders:
#   * `event-to-replace`     -> URI of the event; the surrounding `<...>` is part
#                               of the template and stays in the query.
#   * `<dataset-to-replace>` -> URI prefix of the selected dataset (filled below).
# NOTE(review): `?subEvent` is matched but never used in the projection or the
# filters; confirm whether sub-event timestamps were meant to be aggregated.
QUERY_DATES_TEMPLATE = """
PREFIX sem: <http://semanticweb.cs.vu.nl/2009/11/sem/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT (min(?startTimeEvent) as ?min) (max(?endTimeEvent) as ?max)
WHERE {
    
 ?event owl:sameAs <event-to-replace> .
 ?event sem:hasSubEvent* ?subEvent .
 ?event sem:hasBeginTimeStamp ?startTimeEvent .
 ?event sem:hasEndTimeStamp ?endTimeEvent .
 ?event owl:sameAs ?eventKG .

 FILTER( strStarts( str(?eventKG), "<dataset-to-replace>" ) ) .
}
GROUP BY ?eventKG
HAVING (max(?endTimeEvent) >= min(?startTimeEvent))
"""

# The dataset is fixed for the whole notebook, so its prefix is substituted once here.
QUERY_DATES_TEMPLATE = QUERY_DATES_TEMPLATE.replace(
    "<dataset-to-replace>",
    DATASET_TO_START_URI[DATASET],
)

In [None]:
def get_dates(event):
    """Query the endpoint for the start and end dates of `event`.

    Args:
        event: URI of the event; substituted into `QUERY_DATES_TEMPLATE`.

    Returns:
        pandas.DataFrame parsed from the CSV response. The query projects the
        columns `min` and `max`.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
    """
    query = QUERY_DATES_TEMPLATE.replace("event-to-replace", event)
    response = requests.get(ENDPOINT, headers=HEADERS, params={"query": query})
    # Surface HTTP errors here rather than parsing an error body as CSV.
    response.raise_for_status()
    return pd.read_csv(io.StringIO(response.content.decode("utf-8")))

In [None]:
# Dates of the input event; `result` is read later by `store_changing_config`.
result = get_dates(EVENT)
result

In [None]:
def store_changing_config(dico):
    """Return a copy of `dico` completed with the event-specific entries.

    The entries are built from the module-level `EVENT`, `DATASET`,
    `FOLDER_SAVE_DATA` and `result` (the dates DataFrame with columns `min`
    and `max`).

    Args:
        dico: base dictionary to complete; `None` or `{}` yields only the
            event-specific entries. It is not modified in place.

    Returns:
        dict with the keys of `dico` plus `start`, `start_date`, `end_date`,
        `gold_standard`, `referents` and `name_exp`. Event-specific entries
        take precedence over entries of the same name in `dico`.
    """
    name = EVENT.split("/")[-1]
    # Previously the argument was overwritten and silently ignored; copy it so
    # the caller's dictionary is honoured but left untouched.
    config = dict(dico or {})
    config.update({
        "start": EVENT,
        "start_date": result["min"].values[0],
        "end_date": result["max"].values[0],
        "gold_standard": os.path.join(FOLDER_SAVE_DATA, DATASET, "gs_events", f"{name}.csv"),
        "referents": os.path.join(FOLDER_SAVE_DATA, DATASET,  "referents", f"{name}.json"),
        "name_exp": name,
    })
    return config

# Event-specific entries; merged into the base configuration further down.
dico_config = store_changing_config(dico={})

In [None]:
# Display the event-specific entries for a visual check.
dico_config

## 3. Prepare configuration files

In [None]:
# Load the example configuration of the selected dataset as the base to complete.
with open(
    os.path.join(FOLDER_PATH, "configs-example", "config-" + DATASET + ".json"),
    mode="r",
    encoding="utf-8",
) as openfile:
    BASE_CONFIG = json.loads(openfile.read())

In [None]:
# Complete the base configuration with the event-specific entries, prefix the
# dataset path with the project folder, and save the result as
# <FOLDER_SAVE_DATA>/<DATASET>/config/<last URI segment>.json.
name = EVENT.rsplit("/", 1)[-1]
BASE_CONFIG.update(dico_config)
BASE_CONFIG["dataset_path"] = os.path.join(FOLDER_PATH, BASE_CONFIG["dataset_path"])
with open(
    os.path.join(FOLDER_SAVE_DATA, DATASET, "config", name + ".json"),
    mode="w",
    encoding="utf-8",
) as openfile:
    openfile.write(json.dumps(BASE_CONFIG, indent=4))

## 4. Run the search

In [None]:
from datetime import datetime
from src.framework import GraphSearchFramework

In [None]:
# Reload the configuration file of this event and run the graph search,
# printing the start time, the end time and the elapsed time.
json_path = os.path.join(FOLDER_SAVE_DATA, DATASET, "config", name + ".json")
with open(json_path, mode="r", encoding="utf-8") as openfile_main:
    config_loaded = json.loads(openfile_main.read())
# `rdf_type` is stored as a JSON object; it is handed over as a list of
# (key, value) pairs.
config_loaded["rdf_type"] = [*config_loaded["rdf_type"].items()]

framework = GraphSearchFramework(config=config_loaded)
START = datetime.now()
print(f"Process started at {START}")
framework()
END = datetime.now()
print(f"Process ended at {END}, took {END-START}")