# Elasticsearch

A toy-sized example for indexing and searching a collection of documents.

In [1]:
from elasticsearch import Elasticsearch

In [2]:
import pprint  # for pretty printing of JSON objects

In [3]:
INDEX_NAME = "toy_index"  # the name of the index
DOC_TYPE = "doc"  # we have a single type of document, so it doesn't matter
INDEX_SETTINGS = {  # single shard with a single replica
    "settings" : {
        "index" : {
            "number_of_shards" : 1,
            "number_of_replicas" : 1
        }
    }
}

The collection of documents is given here as a Python dictionary. Each document has two fields: title and content.

In [4]:
DOCS = {
    1: {"title": "Rap God",
        "content": "gonna, gonna, Look, I was gonna go easy on you and not to hurt your feelings"
        },
    2: {"title": "Lose Yourself",
        "content": "Yo, if you could just, for one minute Or one split second in time, forget everything Everything that bothers you, or your problems Everything, and follow me"
        },
    3: {"title": "Love The Way You Lie",
        "content": "Just gonna stand there and watch me burn But that's alright, because I like the way it hurts"
        },
    4: {"title": "The Monster",
        "content": ["gonna gonna I'm friends with the monster", "That's under my bed Get along with the voices inside of my head"]
        },
    5: {"title": "Beautiful",
        "content": "Lately I've been hard to reach I've been too long on my own Everybody has a private world Where they can be alone"
        }
}  # Eminem rulez ;)

### Create Elasticsearch object

In [5]:
es = Elasticsearch()

Check if service is running

In [6]:
es.info()

{'cluster_name': 'elasticsearch',
 'cluster_uuid': 'LMlf8WX9RPC0aJ0eB5R69Q',
 'name': 'Krisztians-MacBook-Pro.local',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2019-09-27T08:36:48.569419Z',
  'build_flavor': 'default',
  'build_hash': '22e1767283e61a198cb4db791ea66e3f11ab9910',
  'build_snapshot': False,
  'build_type': 'tar',
  'lucene_version': '8.2.0',
  'minimum_index_compatibility_version': '6.0.0-beta1',
  'minimum_wire_compatibility_version': '6.8.0',
  'number': '7.4.0'}}

### Create index

If the index exists, we delete it (normally, you don't want to do this).

In [7]:
if es.indices.exists(INDEX_NAME):
    es.indices.delete(index=INDEX_NAME)

We set the number of shards and replicas to be used for each index when it's created. (We use a single shard instead of the default 5.)

In [8]:
es.indices.create(index=INDEX_NAME, body=INDEX_SETTINGS)

{'acknowledged': True, 'index': 'toy_index', 'shards_acknowledged': True}

### Add documents to the index

In [9]:
for doc_id, doc in DOCS.items():
    es.index(index=INDEX_NAME, doc_type=DOC_TYPE, id=doc_id, body=doc)

### Check what has been indexed

Get the contents of doc #3

In [10]:
doc = es.get(index=INDEX_NAME, doc_type=DOC_TYPE, id=3)

In [11]:
pprint.pprint(doc)

{'_id': '3',
 '_index': 'toy_index',
 '_primary_term': 1,
 '_seq_no': 2,
 '_source': {'content': "Just gonna stand there and watch me burn But that's "
                        'alright, because I like the way it hurts',
             'title': 'Love The Way You Lie'},
 '_type': 'doc',
 '_version': 1,
 'found': True}


Get the term vector for doc #3.

`termvectors` returns information and statistics on terms in the fields of a particular document.

In [12]:
tv = es.termvectors(index=INDEX_NAME, doc_type=DOC_TYPE, id=3, fields="title,content")

In [13]:
pprint.pprint(tv)

{'_id': '3',
 '_index': 'toy_index',
 '_type': 'doc',
 '_version': 1,
 'found': True,
 'term_vectors': {'content': {'field_statistics': {'doc_count': 5,
                                                   'sum_doc_freq': 91,
                                                   'sum_ttf': 104},
                              'terms': {'alright': {'term_freq': 1,
                                                    'tokens': [{'end_offset': 59,
                                                                'position': 10,
                                                                'start_offset': 52}]},
                                        'and': {'term_freq': 1,
                                                'tokens': [{'end_offset': 26,
                                                            'position': 4,
                                                            'start_offset': 23}]},
                                        'because': {'term_freq': 1,
                       

### Search

In [14]:
query = "rap monster"
res = es.search(index=INDEX_NAME, q=query, _source=False, size=10)

Print full response (`hits` holds the results)

In [15]:
pprint.pprint(res)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 3}


Print only search results (ranked list of docs)

In [16]:
for hit in res['hits']['hits']:
    print("Doc ID: %3r  Score: %5.2f" % (hit['_id'], hit['_score']))