## Elasticsearch indexing

- Run this notebook to create the Elasticsearch index


In [1]:
import traceback
import elasticsearch
import time
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()
es.info()



{'name': 'BRUNAPC',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': '0_kvMIaGRvKEqdk-L5rqcA',
 'version': {'number': '7.17.6',
  'build_flavor': 'default',
  'build_type': 'zip',
  'build_hash': 'f65e9d338dc1d07b642e14a27f338990148ee5b6',
  'build_date': '2022-08-23T11:08:48.893373482Z',
  'build_snapshot': False,
  'lucene_version': '8.11.1',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [2]:
FIELDS = ['abstract', 'instance']
INDEX_NAME = 'dbpedia'
INDEX_SETTINGS = {
'settings': {
        'index': {
            'number_of_shards': 1,
            'number_of_replicas': 1,

            # configure our default similarity algorithm explicitly to use bm25,
            # this allows it to use it for all the fields
            'similarity': {
                'default': {
                    'type': 'BM25'
                }
            }
        }
    },
    'mappings': {
            'properties': {
                'abstract': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'english'
                },
                'instance': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'english'
                }
            }
        }
    }

In [3]:
def es_create():
    if es.indices.exists(INDEX_NAME):
        es.indices.delete(index=INDEX_NAME)    
    es.indices.create(index=INDEX_NAME, body=INDEX_SETTINGS)

es_create()

  
  after removing the cwd from sys.path.


## Data exploration 

- how the files look like

In [4]:
def ttl_reader(filename, size, enc='utf-8'):
    """
    Print the first lines of a ttl file.
    """
    if size <= 0:
        print("Size must be greater than zero!")
        return

    with open(filename, encoding=enc) as f:
        for i,line in enumerate(f):
            if (size >= 0) and (i >= size+1):
                break
            if i == 0: # Skip top line.
                continue
            print(line.strip())

In [5]:
ttl_reader("mappings/short_abstracts_en.ttl",5)

<http://dbpedia.org/resource/Animalia_(book)> <http://www.w3.org/2000/01/rdf-schema#comment> "Animalia is an illustrated children's book by Graeme Base. It was originally published in 1986, followed by a tenth anniversary edition in 1996, and a 25th anniversary edition in 2012. Over three million copies have been sold. A special numbered and signed anniversary edition was also published in 1996, with an embossed gold jacket."@en .
<http://dbpedia.org/resource/Actrius> <http://www.w3.org/2000/01/rdf-schema#comment> "Actresses (Catalan: Actrius) is a 1997 Catalan language Spanish drama film produced and directed by Ventura Pons and based on the award-winning stage play E.R. by Josep Maria Benet i Jornet. The film has no male actors, with all roles played by females. The film was produced in 1996."@en .
<http://dbpedia.org/resource/Alain_Connes> <http://www.w3.org/2000/01/rdf-schema#comment> "Alain Connes (French: [alɛ̃ kɔn]; born 1 April 1947) is a French mathematician, currently Profess

In [6]:
#ttl_reader("mappings/short_abstracts_en.ttl",5)
ttl_reader("mappings/instance_types_en.ttl", 5)

<http://dbpedia.org/resource/Anarchism> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#Thing> .
<http://dbpedia.org/resource/Achilles> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#Thing> .
<http://dbpedia.org/resource/Autism> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Disease> .
<http://dbpedia.org/resource/Alabama> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/AdministrativeRegion> .
<http://dbpedia.org/resource/Abraham_Lincoln> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/OfficeHolder> .


## Data cleaning

In [7]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
question_tags = ['who', 'what', 'when', 'where', 'which', 'whom', 'whose', 'why']
stop_words = [word for word in stop_words if word not in question_tags]

In [8]:
import re
def preprocess_txt(text):
    
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text).lower()
    text = re.sub(' +', ' ', text)
    stop_words = stopwords.words('english')
    word_list = [word for word in text.split() if word not in stop_words]
    text = " ".join(word_list)
    return text


def preprocess_abstracts(text):
    page_list = re.findall('<.*?>', text)

    try:
        abstract = re.findall('\".*?\"', text)[0]
        abstract = preprocess_txt(abstract)
        entity= page_list[0].split('/')[-1]  #entity
        entity= entity[:-1].replace('_', ' ')
    except:
        abstract = ''
        entity = ''

    return entity, abstract
   

def preprocess_entity_type(text):
    page_list = re.findall('<.*?>', text)

    try: 
        entity = page_list[0].split('/')[-1][:-1].replace('_', ' ')
        entity_type = page_list[-1].split('/')[-1][:-1].replace('owl#', '').replace('_', ' ')
        if entity_type=='Thing':
            entity_type='owl:Thing'
        else:
            entity_type='dbo:'+entity_type
    except:
        entity = ''
        entity_type = ''

    return entity, entity_type


def preprocess_categories(text):
    page_list = re.findall('<.*?>', text) 
    entity = page_list[0].split('/')[-1][:-1]
    category = page_list[-1].split('/')[-1][:-1]
    category = category.replace('Category:','').replace('_', '')
    return entity.replace('_', ' '), category
    

Declaring parsing functions 

*NB! Comment out lines 6 and 7 + lines 28 and 29 to parse only a subset of the corpus.*

In [9]:
size = 30000
def parse_abstracts(data, return_keys=False):   

    with open(file="mappings\short_abstracts_en.ttl", encoding='utf-8') as f:
        for i,line in enumerate(f):
            # if (size >= 0) and (i >= size+1):
            #     break
            if i == 0: # Skip top line.
                continue
            entity, abstract = preprocess_abstracts(line)
            
            #only abstracts that are not empty
            if len(abstract)>0 and len(entity)>0:
                data.update({
                        entity:{ 
                        "_id": entity,
                            "abstract": abstract,
                            "instance":''
                        }
                        })
    if return_keys:                    
        return list(data.keys())

def parse_entity_type(data):
    entity_list = []
    with open(file="mappings\instance_types_en.ttl", encoding='utf-8') as f:
        for i,line in enumerate(f):
            # if (size >= 0) and (i >= size+1):
            #     break
            if i == 0: # Skip top line.
                continue
            entity, entity_type = preprocess_entity_type(line)

            if len(entity_type)>0 and len(entity)>0:
                entity_list.append(entity)
                try:
                    previous_entity = data[entity]['instance']
                    data[entity]['instance']=entity_type #+ previous_entity
                except:
                    pass
            


In [10]:
data = {}

parse_abstracts(data)
parse_entity_type(data)

In [11]:
# Printing out some results:
i=0
for k, v in data.items():
    print('{}:{}'.format(k, v))
    i +=1
    if i == 5:
        break

Animalia (book):{'_id': 'Animalia (book)', 'abstract': 'animalia illustrated children book graeme base originally published 1986 followed tenth anniversary edition 1996 25th anniversary edition 2012 three million copies sold special numbered signed anniversary edition also published 1996 embossed gold jacket', 'instance': 'dbo:Book'}
Actrius:{'_id': 'Actrius', 'abstract': 'actresses catalan actrius 1997 catalan language spanish drama film produced directed ventura pons based award winning stage play e r josep maria benet jornet film male actors roles played females film produced 1996', 'instance': 'dbo:Film'}
Alain Connes:{'_id': 'Alain Connes', 'abstract': 'alain connes french al k n born 1 april 1947 french mathematician currently professor coll ge de france ih ohio state university vanderbilt university invited professor conservatoire national des arts et tiers 2000', 'instance': 'dbo:Scientist'}
An American in Paris:{'_id': 'An American in Paris', 'abstract': 'american paris jazz i

In [14]:
def del_no_type_entity(data):
    """Delete dictionary items with no instance type"""

    print("Amount of data before deletion: ", len(data))
    items = []
    for k,v in data.items():
        if len(data[k]["instance"])==0:
            items.append(k)
    for i in items:
        data.pop(i)
    print("Amount of data after deletion: ", len(data))

In [13]:
del_no_type_entity(data)

Amount of data before deletion:  4855261
Amount of data after deletion:  3253241


## Indexing

In [15]:
batch_size = 20000
doc_list = list(data.values())
for i in range(0, len(data), batch_size):
    actions = [{
            "_index": INDEX_NAME,
            "_id": x["_id"],
            "_source": {
                "abstract": x["abstract"],
                "instance": x["instance"]
            }
        } for x in doc_list[i:i+batch_size]]
    helpers.bulk(es, actions, index=INDEX_NAME, raise_on_error=False, raise_on_exception=False)




## Testing the index

In [16]:
search_param={"match": {"instance": 'dbo:Book'}}

In [17]:
response = es.search(index=INDEX_NAME, query=search_param)
print('Files matched', response['hits']['total']['value'])

Files matched 10000


In [18]:
response['hits']['hits'][0]['_source']['instance']

'dbo:Book'