# DBpedia Indexing

## Imports

In [1]:
import elasticsearch
import os
import re
import string
import time
from collections import deque
import traceback

# stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from tqdm import tqdm
from pprint import pprint

from elasticsearch import Elasticsearch, helpers, exceptions
from typing import Dict

# path variables, etc.
from config import *

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Debug mode: when True, DbPediaCollection.index() stops reading each input
# file after roughly the first 100 lines and prints a few sample documents.
DEBUG = False

In [3]:
# Show the client version and the configuration used for this run.
# INDEX_NAME, INDEX_SETTINGS and the two dump paths presumably come from the
# `config` star-import above — TODO confirm.
print("Elasticsearch version:", elasticsearch.__version__)
print("Index", INDEX_NAME)
print("Index settings:")
pprint(INDEX_SETTINGS)

print("Files to index:", SHORT_ABSTRACT_PATH, INSTANCE_TYPES_EN_PATH)

Elasticsearch version: (8, 5, 0)
Index smart_index
Index settings:
{'mappings': {'properties': {'abstract': {'analyzer': 'english',
                                          'term_vector': 'yes',
                                          'type': 'text'},
                             'instance_type': {'type': 'text'}}}}
Files to index: /mntnvme/datasets/DBpedia/short_abstracts_en.ttl /mntnvme/datasets/DBpedia/instance_types_en.ttl


## DBpedia indexing class (`DbPediaCollection`)

In [9]:
class DbPediaCollection:
    """Parse DBpedia dump files and index them into Elasticsearch.

    Each indexed document is keyed by the preprocessed entity name and holds
    two fields: ``abstract`` (stop words removed) and ``instance_type``
    (``"_"`` when no type line was found for the entity).
    """

    # Translation table that deletes every ASCII punctuation character.
    # Built once here instead of on every preprocess() call.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, index_name: str, index_settings: Dict, stop_words=None, DEBUG=False, FILES=None) -> None:
        """Create the collection and connect to Elasticsearch on localhost:9200.

        Args:
            index_name: Name of the Elasticsearch index to create/query.
            index_settings: Body passed to ``indices.create``.
            stop_words: Iterable of terms dropped from abstracts; defaults to none.
                Stored as a frozenset so membership tests are O(1).
            DEBUG: When True, ``index`` only processes a small sample.
            FILES: Paths of the dump files to read, in processing order.
        """
        self._index_name = index_name
        self._index_settings = index_settings
        self.es = Elasticsearch(hosts=["http://localhost:9200"])
        # None defaults avoid the shared-mutable-default pitfall of `=[]`.
        self.stop_words = frozenset(stop_words) if stop_words else frozenset()
        self.FILES = list(FILES) if FILES else []

        # for local dev
        self.DEBUG = DEBUG

    def preprocess(self, line, remove_stopwords=False):
        """Normalise a string: strip, lowercase, ``_`` -> space, drop punctuation.

        Args:
            line: Text to normalise.
            remove_stopwords: When True, additionally collapse whitespace runs
                and remove every term found in ``self.stop_words``.

        Returns:
            The normalised string.
        """
        # Underscores become spaces *before* punctuation is deleted, otherwise
        # "Animalia_(book)" would collapse to "animaliabook".
        line = line.strip().lower().replace("_", " ")
        line = line.translate(self._PUNCTUATION_TABLE)
        if not remove_stopwords:
            return line

        terms = re.sub(r"\s+", " ", line).split(" ")
        return " ".join(term for term in terms if term not in self.stop_words).strip()

    def parse_instance_types(self, line):
        """Parse one line of the instance-types dump.

        Args:
            line: A line of the form ``<subject> <predicate> <object> .``.

        Returns:
            A dict with ``_id``, ``doc`` and ``_op_type`` keys, or None for
            empty lines, ``#`` comment lines and lines with fewer than three
            space-separated fields.
        """
        # `not line` covers both None and "" (the latter used to raise IndexError).
        if not line or line[0] == "#":
            return None

        fields = line.strip().replace('/>', '>').split(' ')
        if len(fields) < 3:
            return None

        # [1:-1] removes the surrounding < and >; the last URL segment is the name.
        entity = self.preprocess(fields[0][1:-1].split("/")[-1])
        instance_type = self.preprocess(fields[2][1:-1].split("/")[-1].replace("owl#", ""))
        return {
            "_id": entity,
            "doc": {"instance_type": instance_type},
            "_op_type": "update"
        }

    def parse_abstracts(self, line):
        """Parse one line of the short-abstracts dump.

        Args:
            line: A line of the form ``<subject> <predicate> "text"@en .``.

        Returns:
            A dict with ``_id`` and ``_source`` keys (``instance_type`` is the
            placeholder ``"_"``), or None for empty lines, ``#`` comment lines
            and lines with fewer than three space-separated fields.
        """
        # `not line` covers both None and "" (the latter used to raise IndexError).
        if not line or line[0] == "#":
            return None

        fields = line.strip().replace('@en .', '') \
            .replace('"', '').replace('\\', '') \
            .replace('\'', '').replace('/>', '>').split(' ')

        if len(fields) < 3:
            return None

        entity = self.preprocess(fields[0][1:-1].split("/")[-1])
        # Everything after subject and predicate is the abstract text.
        abstract = self.preprocess(' '.join(fields[2:]), True)

        return {
            "_id": entity,
            "_source": {"abstract": abstract, "instance_type": "_"}
        }

    def create_index(self, recreate_index=False):
        """Create the index, optionally dropping an existing one first.

        Args:
            recreate_index: When True, an existing index is deleted and created
                again. When False, an existing index is left untouched.
        """
        if self.es.indices.exists(index=self._index_name):
            if not recreate_index:
                # Creating an index that already exists is rejected by
                # Elasticsearch, so keep the current one instead.
                return
            self.es.indices.delete(index=self._index_name)
        self.es.indices.create(index=self._index_name, body=self._index_settings)

    def query(self, body, size=10):
        """Run a search against the index and print how long it took.

        Args:
            body: Search request body.
            size: Maximum number of hits to return.

        Returns:
            The search response, or None if Elasticsearch rejected the request
            (the error is printed).
        """
        try:
            start_time = time.time()
            res = self.es.search(index=self._index_name, body=body, size=size)
            print("Query time: {:4f} seconds".format(time.time() - start_time))
            return res
        except exceptions.RequestError as e:
            print(e)
            return None

    def index(self, bulk_size=1000, override_debug=False):
        """Read every file in ``self.FILES`` and bulk-index the merged documents.

        Abstract lines create one document per entity; instance-type lines only
        fill in ``instance_type`` on entities that already have an abstract, so
        the abstracts file must be listed before the instance-types file.
        The first line of each file is skipped. Errors are printed, not raised.

        Args:
            bulk_size: Currently ignored; kept for interface compatibility.
                NOTE(review): the notebook calls ``index(bulk_size=1)``, so
                forwarding this as the bulk chunk size would send one request
                per document; it is deliberately not wired up here.
            override_debug: Currently ignored; kept for interface compatibility.
        """
        # Set before the try block so the finally clause can always report a time.
        start_time = time.time()
        try:
            data = {}
            for file in self.FILES:
                with open(file, "r", encoding="utf-8") as f:
                    for i, line in enumerate(f):
                        # in debug, only index 100 entities
                        if self.DEBUG and i > 100:
                            break

                        if i == 0:
                            continue

                        if file == SHORT_ABSTRACT_PATH:
                            abstract = self.parse_abstracts(line)
                            if abstract is None:
                                continue
                            data[abstract["_id"]] = abstract
                        elif file == INSTANCE_TYPES_EN_PATH:
                            types = self.parse_instance_types(line)
                            if types is None:
                                continue
                            if types["_id"] in data:
                                data[types["_id"]]["_source"]["instance_type"] = types["doc"]["instance_type"]
                        else:
                            print("Unknown file", file)
                            break

            print("Indexing begins...", "len of data", len(data))
            entities = []
            for i, (entity, obj) in enumerate(tqdm(data.items())):
                if self.DEBUG and i < 5:
                    print("enity: ", entity, ", obj: ", obj, "\n")
                if self.DEBUG and i > 100:
                    break
                entities.append(obj)

            if entities:
                # raise_on_error=False makes helpers.bulk return the failures
                # instead of raising; report them so rejected documents do not
                # disappear silently.
                _, errors = helpers.bulk(self.es, entities, index=self._index_name, raise_on_error=False)
                failed = errors if isinstance(errors, int) else len(errors)
                if failed:
                    print("Bulk indexing failed for {} document(s)".format(failed))

        except Exception as e:
            print(e, traceback.format_exc())
        finally:
            print("Indexing finished, time elapsed: {:4f} seconds".format(time.time() - start_time))

    # factory method for creating dbpedia collection
    @classmethod
    def create_dbpedia_collection(cls):
        """Build a collection from the notebook-level configuration values."""
        return cls(INDEX_NAME, INDEX_SETTINGS, stop_words=stopwords.words('english'), DEBUG=DEBUG, FILES=[SHORT_ABSTRACT_PATH, INSTANCE_TYPES_EN_PATH])

## Run indexing

In [10]:
# Build the collection from the configuration values and start from a fresh
# index: recreate_index=True deletes an existing index of the same name first.
dbpedia_index = DbPediaCollection.create_dbpedia_collection()
dbpedia_index.create_index(recreate_index=True)

  if self.es.indices.exists(index=self._index_name):
  self.es.indices.delete(index=self._index_name)
  self.es.indices.create(index=self._index_name, body=self._index_settings)
  self.es.indices.create(index=self._index_name, body=self._index_settings)


In [11]:
# Parse the dump files and bulk-index the merged documents.
# NOTE(review): index() never reads bulk_size, so this argument has no effect.
dbpedia_index.index(bulk_size=1)

Indexing begins... len of data 4926000


100%|██████████| 4926000/4926000 [00:02<00:00, 2252690.30it/s]
  helpers.bulk(self.es, entities, index=self._index_name, raise_on_error=False)


Indexing finished, time elapsed: 1365.588029 seconds


In [12]:
# Sanity check: compare the stored document count with the "len of data"
# figure printed by index(); a lower count means some documents were rejected.
print("Number of documents:", dbpedia_index.es.count(index=INDEX_NAME)["count"])

Number of documents: 4925999


  print("Number of documents:", dbpedia_index.es.count(index=INDEX_NAME)["count"])


In [13]:
# Spot-check one document. Ids are the preprocessed entity names
# (lowercased, underscores turned into spaces, punctuation removed).
dbpedia_index.es.get(index=INDEX_NAME, id="animalia book")

  dbpedia_index.es.get(index=INDEX_NAME, id="animalia book")


ObjectApiResponse({'_index': 'smart_index', '_type': '_doc', '_id': 'animalia book', '_version': 1, '_seq_no': 0, '_primary_term': 1, 'found': True, '_source': {'abstract': 'animalia illustrated childrens book graeme base originally published 1986 followed tenth anniversary edition 1996 25th anniversary edition 2012 three million copies sold special numbered signed anniversary edition also published 1996 embossed gold jacket', 'instance_type': 'book'}})

In [14]:
# Spot-check a second document to confirm abstract and instance_type were merged.
dbpedia_index.es.get(index=INDEX_NAME, id="actrius")

  dbpedia_index.es.get(index=INDEX_NAME, id="actrius")


ObjectApiResponse({'_index': 'smart_index', '_type': '_doc', '_id': 'actrius', '_version': 1, '_seq_no': 1, '_primary_term': 1, 'found': True, '_source': {'abstract': 'actresses catalan actrius 1997 catalan language spanish drama film produced directed ventura pons based awardwinning stage play er josep maria benet jornet film male actors roles played females film produced 1996', 'instance_type': 'film'}})