# Parte II - Indexação dos dados e entendimento do cálculo de relevância

Neste notebook iremos:
* Criar um índice no Solr e Elasticsearch
* Indexar os dados de filmes do TMDB (https://www.themoviedb.org/) 
* Realizar uma pesquisa e entender o cálculo de relevância (chamado de explain no Lucene)
* Ajustar o índice para melhorar o cálculo de relevância

### Pré-requisitos:

Antes de executar este notebook, as engines de busca devem estar rodando:

* Para Solr: <b>solr start -e cloud</b>
* Para Elasticsearch: <b>elasticsearch</b>

Este notebook foi testado com Solr 8.2.0 e Elasticsearch 7.6.2

# Inicialização

In [327]:
import json
import requests
from datetime import datetime
headers = {'content-type': 'application/json;charset=UTF-8'}

def date_diff_in_seconds(dt2, dt1):
    """Return the whole number of seconds elapsed from ``dt1`` to ``dt2``.

    Only the ``days`` and ``seconds`` fields of the difference are used, so
    its microsecond component is discarded.
    """
    elapsed = dt2 - dt1
    seconds_per_day = 24 * 3600
    return elapsed.seconds + elapsed.days * seconds_per_day

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get something readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    """Flatten one level of nesting and return the result as a new list.

    Args:
        l: An iterable of iterables, e.g. a list of lists.

    Returns:
        A list holding the items of every inner iterable, in order.
    """
    return [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    """Render an explain tree as indented text, one node per line.

    Each node is written as ``<value>, <description>`` and indented by two
    spaces per nesting level. Nodes listed under the optional ``details``
    key are rendered recursively, one level deeper than their parent.
    """
    indent = " " * (depth * 2)
    pieces = [indent + "%s, %s\n" % (explainJson['value'], explainJson['description'])]
    if 'details' in explainJson:
        pieces.extend(
            simplerExplain(child, depth=depth + 1)
            for child in explainJson['details']
        )
    return "".join(pieces)

# 1. Criação do índice de filmes TMDB

## Criar índice e indexar

In [328]:
# Load the TMDB movie data from its JSON file.
def extract(path='data/tmdb-BR.json'):
    """Read a UTF-8 JSON file and return its decoded content.

    Args:
        path: Location of the JSON file. Defaults to the notebook's dataset.

    Returns:
        Whatever the file's top-level JSON value decodes to.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # The context manager guarantees the file handle is closed after reading.
    with open(path, encoding='UTF-8') as f:
        return json.load(f)

In [329]:
# Create a fresh Solr collection and (re)index the movie data.
def reindex_solr(movieDict=None, delete=True):
    """Index every movie in ``movieDict`` into the Solr ``tmdb`` collection.

    Args:
        movieDict: Mapping whose values are the movie documents to index.
            ``None`` (the default) indexes nothing.
        delete: When true, delete and recreate the ``tmdb`` collection
            before indexing.
    """
    if movieDict is None:
        movieDict = {}

    if delete:
        # The DELETE response is not inspected; only the CREATE status is reported.
        requests.get("http://localhost:8983/solr/admin/collections?action=DELETE&name=tmdb")
        resp = requests.get("http://localhost:8983/solr/admin/collections?action=CREATE&name=tmdb&numShards=1")
        print("solr building...", resp.status_code)

    # Serialise all documents as one JSON array in a single call: this yields
    # valid JSON (no trailing comma) and avoids growing a string in a loop.
    bulkMovies = json.dumps(list(movieDict.values()))

    print("solr indexing...")
    resp = requests.post("http://localhost:8983/solr/tmdb/update/json/docs?commit=true", data=bulkMovies, headers=headers)
    print("solr indexing done.", resp.status_code)

In [330]:
# Create a fresh Elasticsearch index and (re)index the movie data.
def reindex_elastic(analysisSettings=None, mappingSettings=None, movieDict=None):
    """Recreate the Elasticsearch ``tmdb`` index and bulk-index the movies.

    Args:
        analysisSettings: Value sent as the index's ``analysis`` setting;
            an empty dict when omitted.
        mappingSettings: Optional ``mappings`` body; only sent when non-empty.
        movieDict: Mapping whose values are the movie documents. Each
            document's ``id`` key is used as the Elasticsearch ``_id``.
    """
    if analysisSettings is None:
        analysisSettings = {}
    if movieDict is None:
        movieDict = {}

    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings,
            },
        },
    }
    if mappingSettings:
        settings['mappings'] = mappingSettings

    # Drop any previous index; the response of the delete is not inspected.
    requests.delete("http://localhost:9200/tmdb")
    resp = requests.put("http://localhost:9200/tmdb",
                        data=json.dumps(settings), headers=headers)

    print("elastic building...", resp.status_code)
    if resp.status_code != 200:
        print(resp.text)

    # The bulk body is newline-delimited JSON: one action line followed by
    # one document line per movie, each terminated by "\n".
    bulkLines = []
    for movie in movieDict.values():
        addCmd = {"index": {"_index": "tmdb", "_id": movie['id']}}
        bulkLines.append(json.dumps(addCmd))
        bulkLines.append(json.dumps(movie))
    bulkMovies = "".join(line + "\n" for line in bulkLines)

    print("elastic indexing...")
    resp = requests.post("http://localhost:9200/_bulk", data=bulkMovies, headers=headers)
    print("elastic indexing done.", resp.status_code)
    # Mirror the index-creation step: show the body when the request failed.
    if resp.status_code != 200:
        print(resp.text)

In [331]:
# Load the movie data once; the reindexing code in this notebook reads this
# module-level mapping.
movieDict = extract()

# Run the given engine's reindexing routine and report how long it took.
def reindex(engine, engine_method):
    """Call ``engine_method(movieDict=movieDict)`` and print its duration.

    Args:
        engine: Display name of the engine, used only in the printed message.
        engine_method: Callable that accepts a ``movieDict`` keyword argument.
    """
    started_at = datetime.now()
    engine_method(movieDict=movieDict)
    elapsed = date_diff_in_seconds(datetime.now(), started_at)
    print('%s engine done! (took %d seconds)\n' % (engine, elapsed))

FileNotFoundError: [Errno 2] No such file or directory: 'tmdb-BR.json'

As chamadas abaixo recriarão um índice novo no Solr e no Elastic, indexando todos os dados com detecção automática de schema.

In [None]:
# Rebuild both indexes, Elasticsearch first and then Solr, timing each run.
for _engine_name, _engine_method in (('Elastic', reindex_elastic), ('Solr', reindex_solr)):
    reindex(_engine_name, _engine_method)

## Ver o índice criado

In [None]:
def print_solr():
    """Print the fields and the ``text_general`` field type of the Solr ``tmdb`` schema."""
    response = requests.get("http://localhost:8983/solr/tmdb/schema")
    schema = json.loads(response.text)['schema']

    print('Fields:')
    print(json.dumps(schema['fields'], indent=2, sort_keys=True))

    print('\nField Type text_general:')
    # Stays None (printed as "null") when the schema has no such field type.
    text_general = None
    for field_type in schema['fieldTypes']:
        if field_type['name'] == 'text_general':
            text_general = field_type
            break
    print(json.dumps(text_general, indent=2, sort_keys=True))

In [None]:
def print_elastic():
    """Print the Elasticsearch ``tmdb`` index definition as sorted, indented JSON."""
    index_info = json.loads(requests.get("http://localhost:9200/tmdb").text)
    pretty = json.dumps(index_info, indent=2, sort_keys=True)
    print(pretty)

Apresenta os detalhes do índice criado no Elasticsearch. <b>Observem os nomes dos campos e os tipos.</b>

In [None]:
# Dump the definition of the Elasticsearch index under a heading.
print('Elastic results:')
print_elastic()

Apresenta os detalhes do índice criado no Solr. <b>Observem os nomes dos campos e os tipos.</b>

In [None]:
# Dump the Solr schema fields and the text_general field type under a heading.
print('Solr results:')
print_solr()

# 2. Pesquisa básica

Com o índice criado no modo default, vamos realizar uma pesquisa por um filme. Queremos encontrar o filme que é sobre basquete com alienígenas, pois não lembramos o nome do filme. :)

In [None]:
# Free-text query reused by the search, explain and analysis cells.
users_search = 'Basquete com alienígenas'
from IPython.display import Image
# Kept as the cell's final expression so that the notebook renders the image.
Image(filename='img/space_jam.jpg') 

In [None]:
# Run the given search against Solr and print the ranked results.
def search_solr(usersSearch, qf='title^10 overview'):
    """Query the Solr ``tmdb`` collection with edismax and print up to 30 hits.

    Args:
        usersSearch: Free-text query string.
        qf: Value of the edismax ``qf`` parameter (fields with optional boosts).
    """
    # Passing the values through ``params`` lets requests percent-encode them,
    # so spaces, accents, ``^`` and characters such as ``&`` or ``#`` in the
    # user's text cannot corrupt the query string.
    params = {
        'q': usersSearch,
        'defType': 'edismax',
        'qf': qf,
        'rows': 30,
        'wt': 'json',
        'fl': 'title,score',
    }
    httpResp = requests.get('http://localhost:8983/solr/tmdb/select', params=params, headers=headers)
    searchHits = json.loads(httpResp.text)['response']['docs']
    print("Solr results")
    print("Num\tRelevance Score\t\tMovie Title")
    for rank, hit in enumerate(searchHits, start=1):
        print("%s\t%s\t\t%s" % (rank, hit['score'], hit['title']))
    print("\n")

In [None]:
# Run the given search against Elasticsearch and print the ranked results.
def search_elastic(usersSearch, query=None):
    """Search the Elasticsearch ``tmdb`` index and print the hits.

    Args:
        usersSearch: Free-text query string, used only to build the default query.
        query: Optional request body. When falsy, a ``multi_match`` over
            ``title^10`` and ``overview`` with ``size`` ``'30'`` is used.

    Returns:
        The request body that was sent.
    """
    if not query:
        default_match = {
            'query': usersSearch,
            'fields': ['title^10', 'overview'],
        }
        query = {'query': {'multi_match': default_match}, 'size': '30'}

    httpResp = requests.get(
        'http://localhost:9200/tmdb/_search',
        data=json.dumps(query),
        headers=headers,
    )
    hitsEnvelope = json.loads(httpResp.text)['hits']
    print("Elasticsearch results")
    print("Num\tRelevance Score\t\tMovie Title")
    rank = 0
    for hit in hitsEnvelope['hits']:
        rank += 1
        print("%s\t%s\t\t%s" % (rank, hit['_score'], hit['_source']['title']))
    print("\n")
    return query

In [None]:
# Run the default Elasticsearch search and keep the request body it returns;
# the explain cell sets query['explain'] on it and sends it again.
query = search_elastic(users_search)

In [None]:
# Run the same search against Solr with the default qf ('title^10 overview').
search_solr(users_search)

Assim, o veredicto é:

In [None]:
from IPython.display import display, HTML

# Scoreboard shown after the first round of searches. Kept as the cell's
# final expression so that the notebook renders the table.
HTML('''<table>
        <tr><td><b>Solr 0 x 0 Elasticsearch</b></td></tr>
        <tr><td><img src="img/source.gif"></td></tr>
    </table>''')

## O que aconteceu? 

### Visualização da query lucene que a engine gerou

In [None]:
def explain_solr(users_search, qf='title^10 overview'):
    """Print the parsed query Solr builds for ``users_search``.

    Args:
        users_search: Free-text query string.
        qf: Value of the edismax ``qf`` parameter (fields with optional boosts).
    """
    # ``params`` makes requests percent-encode every value, so reserved
    # characters in the user's text cannot break the query string.
    params = {
        'q': users_search,
        'debugQuery': 'true',
        'defType': 'edismax',
        'qf': qf,
        'rows': 1,
        'wt': 'json',
        'fl': 'title,score',
    }
    httpResp = requests.get('http://localhost:8983/solr/tmdb/select', params=params, headers=headers)
    explain = json.loads(httpResp.text)['debug']['parsedquery']
    print('Explicação da query no Solr:')
    print(explain)
    print('\n')

In [None]:
def explain_elastic(users_search):
    """Print the response of Elasticsearch's ``_validate/query?explain`` endpoint.

    The validated query is a ``multi_match`` of ``users_search`` over
    ``title^10`` and ``overview``.

    Args:
        users_search: Free-text query string.
    """
    query = {
        'query': {
            'multi_match': {
                'query': users_search,
                'fields': ['title^10', 'overview'],
            }
        }
    }
    httpResp = requests.get('http://localhost:9200/tmdb/_validate/query?explain', data=json.dumps(query), headers=headers)
    print('Explicação da query no Elasticsearch:')
    # ensure_ascii=False keeps accented characters readable; the result is
    # already a str, so it is printed directly.
    print(json.dumps(json.loads(httpResp.text), indent=2, ensure_ascii=False))
    print('\n')

In [None]:
# Print each engine's explanation of the query built from the same user search.
explain_elastic(users_search)
explain_solr(users_search)

### Debug da análise da query

Análise no Solr

In [None]:
# Ask Solr's field-analysis handler how the ``title`` field processes the
# search text. ``params`` lets requests percent-encode the user's text.
resp = requests.get(
    "http://localhost:8983/solr/tmdb/analysis/field",
    params={
        "analysis.fieldname": "title",
        "analysis.query": users_search,
        "analysis.showmatch": "true",
        "wt": "json",
    },
)

# ensure_ascii=False keeps accented characters readable in the output.
json_str = json.dumps(json.loads(resp.text), indent=2, ensure_ascii=False)
print(json_str)

Análise no Elasticsearch

In [None]:
import urllib.parse

# Build the request body with json.dumps so that quotes or backslashes in the
# search text are escaped instead of producing malformed JSON.
data = json.dumps({"field": "title", "text": users_search}, ensure_ascii=False)
params = data.encode()

resp = requests.get('http://localhost:9200/tmdb/_analyze', data=params, headers=headers)

# ensure_ascii=False keeps accented characters readable in the output.
json_str = json.dumps(json.loads(resp.text), indent=2, ensure_ascii=False)
print(json_str)

# Entendendo o resultado

### Elasticsearch

"explanation": "((overview:basquete overview:com overview:alienígenas) | (title:basquete title:com title:alienígenas)^10.0)"

In [None]:
# Kept as the cell's final expression so that the notebook renders the image.
Image(filename='img/field_centric.png') 

In [None]:
# Re-send the last Elasticsearch query with per-hit scoring explanations enabled.
query['explain'] = True
httpResp = requests.get('http://localhost:9200/tmdb/_search', data=json.dumps(query), headers=headers)
jsonResp = json.loads(httpResp.text)
hits = jsonResp['hits']['hits']
# Explain the first four hits and the 26th one. Positions past the end of the
# result list are skipped rather than raising IndexError.
for position in (0, 1, 2, 3, 25):
    if position >= len(hits):
        continue
    hit = hits[position]
    print("Explain for %s" % hit['_source']['title'])
    print(simplerExplain(hit['_explanation']))

### Solr

+(DisjunctionMaxQuery((overview:basquete | (title:basquete)^10.0)) DisjunctionMaxQuery((overview:com | (title:com)^10.0)) DisjunctionMaxQuery((overview:alienígenas | (title:alienígenas)^10.0)))

In [None]:
# Kept as the cell's final expression so that the notebook renders the image.
Image(filename='img/term_centric.png') 

In [None]:
# Run the Solr search with debug output and print the score explanation of
# selected hits. ``params`` lets requests percent-encode the user's text.
url = 'http://localhost:8983/solr/tmdb/select'
solr_params = {
    'q': users_search,
    'debug': 'True',
    'defType': 'edismax',
    'qf': 'title^10 overview',
    'rows': 30,
    'wt': 'json',
    'fl': 'id,title,score',
}
httpResp = requests.get(url, params=solr_params, headers=headers)
solrResp = json.loads(httpResp.text)  # parse the body once, use it twice
docs = solrResp['response']['docs']
debug = solrResp['debug']['explain']  # explanations keyed by document id
# Explain the first four hits and the 26th one. Positions past the end of the
# result list are skipped rather than raising IndexError.
for position in (0, 1, 2, 3, 25):
    if position >= len(docs):
        continue
    doc = docs[position]
    print("Explain for %s" % doc['title'])
    print(debug[doc['id']])

# 3. Melhorando os resultados

Vamos modificar a análise para melhorar os resultados.

### Elasticsearch

Vamos alterar o analisador para o analyzer <b>portuguese</b>.<p>
(https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#portuguese-analyzer)

Este analisador vem com o elasticsearch e o pipeline dele é:
* "lowercase"
* "portuguese_stop"
* "portuguese_keywords"
* "portuguese_stemmer"

In [None]:
# Map both searched text fields to the ``portuguese`` analyzer, then rebuild
# the Elasticsearch index with that mapping.
mappingSettings = {
    'properties': {
        field_name: {'type': 'text', 'analyzer': 'portuguese'}
        for field_name in ('title', 'overview')
    }
}
reindex_elastic(movieDict=movieDict, mappingSettings=mappingSettings)

Verificando o resultado da análise após a alteração

In [None]:
import urllib.parse

# Build the request body with json.dumps so that quotes or backslashes in the
# search text are escaped instead of producing malformed JSON.
data = json.dumps({"field": "title", "text": users_search, "explain": True}, ensure_ascii=False)
params = data.encode()

resp = requests.get('http://localhost:9200/tmdb/_analyze', data=params, headers=headers)

# ensure_ascii=False keeps accented characters readable in the output.
json_str = json.dumps(json.loads(resp.text), indent=2, ensure_ascii=False)
print(json_str)

Vamos repetir a pesquisa

In [None]:
# Echo the search text, then repeat the default Elasticsearch search and keep
# the request body it returns.
print(users_search)
query = search_elastic(users_search)

Vamos agora ver a explicação do cálculo

In [None]:
# Re-send the last Elasticsearch query with per-hit scoring explanations enabled.
query['explain'] = True
httpResp = requests.get('http://localhost:9200/tmdb/_search', data=json.dumps(query), headers=headers)
jsonResp = json.loads(httpResp.text)
hits = jsonResp['hits']['hits']
# Explain the first four hits and the 26th one. Positions past the end of the
# result list are skipped rather than raising IndexError.
for position in (0, 1, 2, 3, 25):
    if position >= len(hits):
        continue
    hit = hits[position]
    print("Explain for %s" % hit['_source']['title'])
    print(simplerExplain(hit['_explanation']))

### Solr

Vamos alterar o analisador para o analyzer <b>text_pt</b>.<p>

<b>Não vamos confundir o text_pt default do Solr com o text_pt utilizado na busca!</b>

O text_pt default do Solr tem o seguinte pipeline:
 
* tokenizer class="solr.StandardTokenizerFactory"
* filter class="solr.LowerCaseFilterFactory"
* filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_pt.txt" ignoreCase="true"
* filter class="solr.PortugueseLightStemFilterFactory"

In [None]:
# Post one replace-field command per searched field to the Solr schema
# endpoint, switching each to the ``text_pt`` type, and print every response.
url = 'http://localhost:8983/solr/tmdb/schema'
_replace_commands = (
    '{"replace-field":{"name":"title","type":"text_pt","stored":true }}',
    '{"replace-field":{"name":"overview","type":"text_pt","stored":true }}',
)
for data in _replace_commands:
    httpResp = requests.post(url, data=data, headers=headers)
    print(httpResp.text)

In [None]:
# delete=False skips dropping and recreating the collection, so the documents
# are indexed into the existing one (presumably to keep the schema change).
reindex_solr(movieDict=movieDict, delete=False)

Verificando o resultado da análise após a alteração

In [None]:
# Ask Solr's field-analysis handler how the ``title`` field processes the
# search text. ``params`` lets requests percent-encode the user's text.
resp = requests.get(
    "http://localhost:8983/solr/tmdb/analysis/field",
    params={
        "analysis.fieldname": "title",
        "analysis.query": users_search,
        "analysis.showmatch": "true",
        "wt": "json",
    },
)

# ensure_ascii=False keeps accented characters readable in the output.
json_str = json.dumps(json.loads(resp.text), indent=2, ensure_ascii=False)
print(json_str)
explain_solr(users_search)

Vamos repetir a pesquisa

In [None]:
# Echo the search text, then repeat the Solr search with the default qf.
print(users_search)
search_solr(users_search)

In [None]:
# Run the Solr search with debug output and print the score explanation of
# selected hits. ``params`` lets requests percent-encode the user's text.
url = 'http://localhost:8983/solr/tmdb/select'
solr_params = {
    'q': users_search,
    'debug': 'True',
    'defType': 'edismax',
    'qf': 'title^10 overview',
    'rows': 30,
    'wt': 'json',
    'fl': 'id,title,score',
}
httpResp = requests.get(url, params=solr_params, headers=headers)
solrResp = json.loads(httpResp.text)  # parse the body once, use it twice
docs = solrResp['response']['docs']
debug = solrResp['debug']['explain']  # explanations keyed by document id
# Explain the first four hits and the 26th one. Positions past the end of the
# result list are skipped rather than raising IndexError.
for position in (0, 1, 2, 3, 25):
    if position >= len(docs):
        continue
    doc = docs[position]
    print("Explain for %s" % doc['title'])
    print(debug[doc['id']])

# 4.	Melhorando os resultados 2

Agora vamos refletir sobre as expectativas do usuário em relação à consulta:

* O usuário espera que, se o seu termo de busca for encontrado no título, o documento seja considerado mais importante?
* Outros questionamentos?

## Elasticsearch

In [None]:
# Search again with a multi_match over title and overview that carries no
# boost on either field, asking Elasticsearch for scoring explanations.
users_search = 'Basquete com alienígenas'
_unboosted_match = {'query': users_search, 'fields': ['title', 'overview']}
query = {'query': {'multi_match': _unboosted_match}, 'explain': True}

query = search_elastic(users_search, query)

## Solr

In [None]:
# Same Solr search, but with a qf that has no boost on the title field.
search_solr(users_search,qf='title overview')