# Changing retrieval model parameters in Elasticsearch

This assignment uses the `aquaint` index (assuming you've already created that for Assignment 1). Alternatively, any other Elasticsearch index may be used.

In [1]:
from elasticsearch import Elasticsearch

In [2]:
# Name of the Elasticsearch index that is searched and whose similarity settings are changed.
INDEX_NAME = "aquaint"
# Document type passed to es.search when running the queries.
DOC_TYPE = "doc"
# Name of the document field to search.
# NOTE(review): the retrieval cell hard-codes df="content" rather than using this constant.
FIELD = "content"

In [3]:
# Elasticsearch client; no arguments are passed, so the library's default connection settings apply.
es = Elasticsearch()

In [4]:
# Input file read by load_queries: one "<query id> <query text>" pair per line.
QUERY_FILE = "data/queries.txt"
# Output file that the retrieval cell writes the query/document-ID rows to.
OUTPUT_FILE = "data/bm_optimized.txt"

### Small utility function for loading the queries from a file

In [5]:
# NOTE(review): this re-creates the client that an earlier cell already assigned to `es`;
# the duplicate looks unnecessary and can probably be removed.
es = Elasticsearch()

In [6]:
def load_queries(query_file):
    """Load queries from a text file into a dict.

    Each non-blank line is expected to hold a query ID and the query text,
    separated by a single space: ``<qid> <query text>``. Blank or
    whitespace-only lines are skipped.

    Args:
        query_file: Path of the query file to read.

    Returns:
        Dict mapping query ID (str) to query text (str). If an ID occurs
        more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line contains no space separating the
            query ID from the query text.
    """
    queries = {}
    with open(query_file, "r") as fin:
        # Iterate the file lazily instead of materializing it with readlines().
        for line in fin:
            line = line.strip()
            if not line:
                # Blank lines (e.g. an extra newline at the end of the file)
                # carry no query and would otherwise break the unpacking below.
                continue
            qid, query = line.split(" ", 1)
            queries[qid] = query
    return queries

In [7]:
# Dict mapping query ID -> query text, read from QUERY_FILE.
queries = load_queries(QUERY_FILE)

### Changing BM25 parameters

Change the default similarity function

In [8]:
# Index settings body that replaces the index's default similarity with BM25
# using tuned (non-default) parameters.
SIM = {
    "similarity": {
        "default": { 
            "type": "BM25",
            # b: document-length normalization parameter (per the Elasticsearch BM25 similarity docs).
            "b": 0.1,
            # k1: term-frequency saturation parameter (per the Elasticsearch BM25 similarity docs).
            "k1": 0.8
        }
    }
}

A custom similarity can be updated by closing the index, updating the index settings, and reopening the index.

In [9]:
# Apply the similarity settings in SIM: close the index, update its settings,
# then reopen it.
es.indices.close(index=INDEX_NAME)
try:
    es.indices.put_settings(index=INDEX_NAME, body=SIM)
finally:
    # Reopen even when the update fails, so the index is never left closed
    # (a closed index cannot be searched by the cells that follow).
    reopen_response = es.indices.open(index=INDEX_NAME)
# Keep the acknowledgement of the reopen as the cell's displayed output.
reopen_response

{'acknowledged': True}

You might need to wait a little bit before firing the first query. If you're getting errors from Elasticsearch, you can use the code below to wait a few seconds.

In [10]:
from time import sleep
# Pause for 3 seconds so the reopened index has time to become searchable
# before the first query is sent.
# NOTE(review): a fixed delay is a guess; confirm it is long enough for this index.
sleep(3)

Then run every query against the index and write the IDs of the top 100 documents for each query to the output file.

In [11]:
# Run every query against the index and write the retrieved document IDs to
# OUTPUT_FILE. The context manager closes the file even if a search fails.
with open(OUTPUT_FILE, 'w') as f:
    f.write('QueryId,DocumentId\n')

    for q_id, query in queries.items():
        # Retrieve up to 100 hits, keeping only each hit's _id in the response.
        res = es.search(index=INDEX_NAME, doc_type=DOC_TYPE, q=query, df=FIELD,
                        size=100, filter_path=['hits.hits._id'])

        # A filtered response may omit the 'hits' key entirely when a query
        # matches nothing, so fall back to an empty list instead of raising
        # KeyError.
        docIDs = [hit['_id'] for hit in res.get('hits', {}).get('hits', [])]

        # One "<query id>,<document id>" row per retrieved document.
        for doc_id in docIDs:
            f.write(str(q_id) + "," + doc_id + '\n')

        # NOTE(review): this additional row repeats all IDs of the query,
        # space-separated, in the DocumentId column. It is kept to preserve
        # the existing output format, but it looks like a leftover and does
        # not match the one-document-per-row header -- confirm whether it is
        # still wanted.
        f.write(str(q_id) + "," + " ".join(docIDs) + '\n')



You can also retrieve the current similarity settings.

In [12]:
# Fetch the index's current settings; the similarity configuration appears under 'similarity'.
es.indices.get_settings(index=INDEX_NAME)

{'aquaint': {'settings': {'index': {'creation_date': '1504521411837',
    'number_of_replicas': '1',
    'number_of_shards': '1',
    'provided_name': 'aquaint',
    'similarity': {'default': {'b': '0.1', 'k1': '0.8', 'type': 'BM25'}},
    'uuid': 'QeZeETRURrK7Jb61Zb-dcA',
    'version': {'created': '5050299'}}}}}

### Using a different retrieval model

Similarly to above, you may also change the retrieval model that is used. Elasticsearch implements, among others, language modeling (LM) and divergence from randomness (DFR).

See: https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules-similarity.html

Use Language Modeling with Jelinek-Mercer smoothing and with lambda=0.2. Check how the retrieval scores change for the top-10 documents.

In [13]:
# SIM = {
#     "similarity": {
#         "default": {
#             "type": "LMJelinekMercer",
#             "lambda": 0.2
#         }
#     }
# }

In [14]:
# es.indices.close(index=INDEX_NAME)
# es.indices.put_settings(index=INDEX_NAME, body=SIM)
# es.indices.open(index=INDEX_NAME)

In [15]:
# sleep(0.1)

In [16]:
# res = es.search(index=INDEX_NAME, q=query, df=FIELD, _source=False, size=10).get("hits", {}).get("hits", {})

### Changing back to default similarity

**Important:** you need to change back to the default similarity manually.

In [17]:
# Index settings body that switches the index's default similarity back to BM25
# with the parameter values this notebook treats as the defaults.
SIM = {
    "similarity": {
        "default": {
            "type": "BM25",
            # b: document-length normalization parameter (per the Elasticsearch BM25 similarity docs).
            "b": 0.75,
            # k1: term-frequency saturation parameter (per the Elasticsearch BM25 similarity docs).
            "k1": 1.2
        }
    }
}

In [18]:
# Apply the similarity settings in SIM: close the index, update its settings,
# then reopen it.
es.indices.close(index=INDEX_NAME)
try:
    es.indices.put_settings(index=INDEX_NAME, body=SIM)
finally:
    # Reopen even when the update fails, so the index is never left closed.
    reopen_response = es.indices.open(index=INDEX_NAME)
# Keep the acknowledgement of the reopen as the cell's displayed output.
reopen_response

{'acknowledged': True}