In [1]:
import numpy as np
import pandas as pd
import json
import os
import string
import re
from nltk.corpus import stopwords
from data_cleaning import load_data 
from evaluate import load_type_hierarchy, get_type_path
from collections import Counter
from elasticsearch import Elasticsearch, helpers

In [2]:
# Make sure the NLTK stop-word corpus is available locally; stopwords.words('english')
# is read from it further down in the notebook.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bruna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Name of the Elasticsearch index that the retrieval functions below search by default.
INDEX_NAME = 'dbpedia_2'

In [4]:
# Create an Elasticsearch client with the library's default settings and call info() on it.
# The info() result is not displayed because it is not the last expression of the cell.
es = Elasticsearch()
es.info()
# Training split (per the file path: the DBpedia SMART-task training set).
train = load_data('datasets/DBpedia/smarttask_dbpedia_train.json')
# The raw test split is replaced by results/test_queries_svm_output.json.
# NOTE(review): presumably that file holds the test queries with categories predicted by
# an SVM step — confirm before relying on it.
#test = load_data('datasets/DBpedia/smarttask_dbpedia_test.json')
test = load_data('results/test_queries_svm_output.json')




In [5]:
# Question words are deliberately left out of the stop-word list so that
# preprocessing does not strip them from the queries.
question_tags = ['who', 'what', 'when', 'where', 'which', 'whom', 'whose', 'why']

stop_words = [token for token in stopwords.words('english') if token not in question_tags]

In [6]:
def preprocess_txt(text):
    """
    Normalise a question string before it is used as a query.

    Every character other than an ASCII letter, a digit or whitespace is
    replaced by a space, the text is lower-cased, runs of spaces are squeezed
    into one, and tokens found in the module-level ``stop_words`` list are
    dropped.

    Arguments:
        text: string

    Returns:
        The remaining tokens joined by single spaces (string)
    """
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    cleaned = re.sub(' +', ' ', cleaned.lower())
    kept_tokens = filter(lambda token: token not in stop_words, cleaned.split())
    return " ".join(kept_tokens)


def load_queries(docs):
    """
    Collect the preprocessed 'resource' questions from already-loaded query records.

    Records whose 'category' is not 'resource', or whose 'question' is None,
    are skipped.

    Arguments:
        docs: iterable of dicts with the keys 'id', 'question', 'category' and 'type'

    Returns:
        Dictionary mapping each kept record's id to
        {'question': preprocessed question, 'category': ..., 'type': ...}
    """
    resource_queries = {}
    for x in docs:
        # The category is checked first, so 'question' is only read for resource records.
        if x['category'] != 'resource' or x['question'] is None:
            continue

        resource_queries[x['id']] = {
            'question': preprocess_txt(x['question']),
            'category': x['category'],
            'type': x['type'],
        }

    return resource_queries

In [7]:
# Keep only the preprocessed 'resource' questions of each split, keyed by query id.
training_queries = load_queries(train)
test_queries = load_queries(test)

In [12]:
def baseline_retrieval(query, field = 'abstract', index = INDEX_NAME, k = 10, size = 200):
    """
    Baseline type retrieval using the inbuilt BM25 index from elastic search

    The top `size` documents matching the query (excluding documents whose
    'instance' field matches "owl:Thing") are retrieved, and the instance
    types that occur most often among them are returned.

    Arguments:
        query: string, space separated terms
        field: string, name of the indexed field the query is matched against
        index: string, name of the index to search
        k: integer, maximum number of types to return
        size: integer, number of documents to retrieve from the index

    Returns:
        List of at most k instance types, most frequent first
    """
    hits = es.search(index=index, size=size,
                query = {"bool": {"must": {"match": {field: query}},
                                  "must_not": {"match": {"instance": "owl:Thing"}}}})['hits']['hits']
    # Every hit already carries its _source, so the type is read from the search
    # response instead of issuing one extra es.get() round trip per document.
    hit_types = [obj['_source'].get('instance') for obj in hits]
    # Documents without an 'instance' field cannot vote for a type; counting them
    # would let None end up in the returned type list.
    type_counts = Counter(t for t in hit_types if t is not None)

    return [t for t, _ in type_counts.most_common(k)]

In [19]:
def es_BM25(es, data):
    """
    Run baseline_retrieval for every query in `data` that has a non-empty question.

    Arguments:
        es: not used inside this function; baseline_retrieval works with the
            module-level client
        data: dictionary mapping query IDs to dicts with 'question' and 'category'

    Returns:
        Dictionary mapping each processed query ID to
        {"id": query ID, "category": the query's category, "type": retrieval result}
    """
    results = {}
    for query_id, query in data.items():
        # Queries with an empty question string are skipped.
        if len(query['question']) == 0:
            continue
        retrieved = baseline_retrieval(query['question'],
                                       field = 'abstract', index = INDEX_NAME)
        results[query_id] = {
            "id": query_id,
            "category": query["category"],
            "type": retrieved,
        }
    return results
    

In [20]:
# Run the BM25 baseline over the preprocessed 'resource' test queries.
test_res = es_BM25(es, test_queries)



In [21]:
# Display the retrieval results.
# NOTE(review): this prints the entry of every query; showing a small slice would keep
# the cell output readable.
test_res

{'dbpedia_7955': {'id': 'dbpedia_7955',
  'category': 'resource',
  'type': ['dbo:Language',
   'dbo:Scientist',
   'dbo:Country',
   'dbo:ProgrammingLanguage',
   'dbo:Settlement',
   'dbo:Writer',
   'dbo:Person',
   'dbo:MilitaryConflict',
   'dbo:Island',
   'dbo:City']},
 'dbpedia_22599': {'id': 'dbpedia_22599',
  'category': 'resource',
  'type': ['dbo:MilitaryConflict',
   'dbo:MilitaryUnit',
   'dbo:Royalty',
   'dbo:Settlement',
   'dbo:Writer',
   'dbo:Person',
   'dbo:ArtificialSatellite',
   'dbo:City',
   'dbo:AdministrativeRegion',
   'dbo:OfficeHolder']},
 'dbpedia_19677': {'id': 'dbpedia_19677',
  'category': 'resource',
  'type': ['dbo:Settlement',
   'dbo:Country',
   'dbo:City',
   'dbo:AmericanFootballTeam',
   'dbo:PoliticalParty',
   'dbo:MilitaryUnit',
   'dbo:BaseballTeam',
   'dbo:AdministrativeRegion',
   'dbo:Person',
   'dbo:Scientist']},
 'dbpedia_11163': {'id': 'dbpedia_11163',
  'category': 'resource',
  'type': ['dbo:City',
   'dbo:Settlement',
   'dbo:P

In [26]:
# Inspect the retrieval result of a single query.
test_res['dbpedia_7955']

{'id': 'dbpedia_7955',
 'category': 'resource',
 'type': ['dbo:Language',
  'dbo:Scientist',
  'dbo:Country',
  'dbo:ProgrammingLanguage',
  'dbo:Settlement',
  'dbo:Writer',
  'dbo:Person',
  'dbo:MilitaryConflict',
  'dbo:Island',
  'dbo:City']}

In [23]:
# NOTE(review): `tt` is only defined in the following cell, so this cell fails under
# Restart & Run All unless it is moved below that definition.
tt['dbpedia_16015']['type']

['number']

In [17]:
# Index every test record by its id, keeping the id, category and type fields.
tt = {}
for record in test:
    tt[record['id']] = {
        'id': record['id'],
        'category': record['category'],
        'type': record['type'],
    }

In [28]:
# Overwrite the type list of every query that received a retrieval result.
# Queries absent from test_res keep the type they were loaded with. An explicit
# membership test replaces the former bare `except`, which hid every kind of error.
for key in tt:
    if key in test_res:
        tt[key]['type'] = test_res[key]['type']

In [34]:
# Write the merged predictions as a JSON list. The context manager closes the file
# even if serialisation raises.
with open("results/bm25_es_system_output.json", "w") as f:
    json.dump(list(tt.values()), f)

In [35]:
# NOTE(review): the first cell already imports from this module; these names could be
# imported there together with the others.
from evaluate import load_ground_truth, load_system_output, evaluate

# Score the written system output against the ground truth of the test split.
type_hierarchy, max_depth = load_type_hierarchy('evaluation/dbpedia/dbpedia_types.tsv')
ground_truth = load_ground_truth('datasets/DBpedia/smarttask_dbpedia_test.json', type_hierarchy)
system_output = load_system_output('results/bm25_es_system_output.json')
evaluate(system_output, ground_truth, type_hierarchy, max_depth)

Loading type hierarchy from evaluation/dbpedia/dbpedia_types.tsv... 761 types loaded (max depth: 7)
Loading ground truth from datasets/DBpedia/smarttask_dbpedia_test.json... 
   4369 questions loaded
Loading system predictions from results/bm25_es_system_output.json... 
   4369 predictions loaded


Evaluation results:
-------------------
Category prediction (based on 4369 questions)
  Accuracy: 0.925
Type ranking (based on 4369 questions)
  NDCG@5:  0.527
  NDCG@10: 0.522


In [None]:
def evaluate_simple(es, k=100, amount=0, index=INDEX_NAME):
    """
    A test evaluation, simply re-rank using relevancy,
    0 = Not relevant
    1 = Partially relevant
    2 = Relevant

    For every training query, the query's own types get relevancy 2 and the
    other types returned by get_type_path for them get relevancy 1. The
    instance types of the top k hits are then ordered by that relevancy and
    the first 10 are kept (one entry per hit, duplicates are not removed).
    Queries that end up with no relevant types are skipped.

    Arguments:
        es: Elasticsearch object instance.
        k: How many documents to handle per query.
        amount: How many queries to run, 0 = all.
        index: Name of the index with respect to which the query is analyzed.

    Returns:
        A dictionary containing the queryIDs - list of retrieved instance types.
    """
    progress, N = 0, len(training_queries)
    # Forward slashes resolve on every platform; the former backslash path only worked
    # on Windows and relied on an invalid string escape.
    type_hierarchy, _ = load_type_hierarchy("evaluation/dbpedia/dbpedia_types.tsv")
    results = {}
    for qId, queryObject in training_queries.items():
        type_relevancy = {}
        for typ in queryObject['type']:
            if typ not in type_hierarchy:
                continue
            for v in get_type_path(type_hierarchy, typ):
                type_relevancy[v] = 1 # Relevant, its in the same hierarchy but in a diff pos.

        for typ in queryObject['type']:
            type_relevancy[typ] = 2 # This is the type we want. Give it the highest weight.

        if not type_relevancy:
            continue

        query = queryObject['question']
        # NOTE(review): baseline_retrieval excludes "owl:Thing" while this excludes "thing";
        # whether both match the same documents depends on the index analyzer — confirm.
        hits = es.search(index=index, _source=True, size=k,
            query={"bool": {"must": {"match": {"abstract": query}}, "must_not": {"match": {"instance": "thing"}}}}
        )['hits']['hits']

        rerank = []
        for obj in hits:
            instanceType = obj['_source']['instance']
            if instanceType not in type_hierarchy:
                # Types unknown to the hierarchy are never relevant.
                relevancy = 0
            elif instanceType in type_relevancy:
                relevancy = type_relevancy[instanceType]
            else:
                # Partially relevant when any type on this hit's own path is relevant.
                path = get_type_path(type_hierarchy, instanceType)
                relevancy = 1 if any(t in type_relevancy for t in path) else 0
            rerank.append((instanceType, relevancy))

        # Stable sort: hits with equal relevancy keep their retrieval order.
        rerank.sort(key=lambda x:x[-1], reverse=True) # Re-rank the initial hits based on their relevancy.
        results[qId] = [v for v,_ in rerank[:10]]

        progress += 1
        if (progress % 1000) == 0:
            print('Progress - {}/{} queries handled.'.format(progress, N))

        if amount and (progress >= amount):
            break

    return results