# Notebook genérico que contém as funções utilizadas nos demais notebooks

### Funções Genéricas

In [None]:
# PARAMETERS

# Host and port of each search engine (format host:port)
SOLR_ADDR = "localhost:8983"
ELASTIC_ADDR = "localhost:9200"

In [None]:
import json
import requests
import pandas as pd
from datetime import datetime
# JSON content-type header shared by the request helpers in this notebook.
headers = {"content-type": "application/json;charset=UTF-8"}

def date_diff_in_seconds(dt2, dt1):
    """Return the whole number of seconds elapsed from ``dt1`` to ``dt2``.

    Only the ``days`` and ``seconds`` components of the difference are used,
    so the microsecond component is ignored.
    """
    elapsed = dt2 - dt1
    seconds_per_day = 24 * 3600
    return elapsed.seconds + elapsed.days * seconds_per_day

# Some utilities for flattening the explain into something a bit more
# readable. Pass Explain JSON, get readable (ironically this is what Solr's default output is :-p)
def flatten(l):
    """Flatten one level of nesting.

    Args:
        l: An iterable of iterables.

    Returns:
        A new list with the items of every inner iterable, in order.
    """
    # The original evaluated this comprehension but never returned it,
    # so every call yielded None.
    return [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    """Render an explain tree as indented text, one node per line.

    Each node is printed as ``"<value>, <description>"`` indented by two
    spaces per nesting level; children listed under ``'details'`` are
    rendered recursively right after their parent.
    """
    indent = " " * (depth * 2)
    header = indent + "%s, %s\n" % (explainJson['value'], explainJson['description'])
    children = explainJson['details'] if 'details' in explainJson else ()
    return header + "".join(
        simplerExplain(child, depth=depth + 1) for child in children
    )

In [None]:
import pickle

def extract(path="../Dados/movies.p"):
    """Load and return the pickled movie data.

    Args:
        path: Location of the pickle file. Defaults to the notebook's
            ``../Dados/movies.p`` data file.

    Returns:
        The unpickled object (the notebook treats it as a dictionary of
        movies).

    Note:
        ``pickle.load`` can execute arbitrary code while loading; only use
        this with a data file you trust.
    """
    # The context manager closes the file handle, which the original
    # one-liner left open.
    with open(path, "rb") as movies_file:
        return pickle.load(movies_file)

### Solr: reindexação, pesquisa e explain

In [None]:
def reindex_solr(movieDict=None, delete=True):
    """Optionally recreate the Solr ``tmdb`` collection and index the movies.

    Args:
        movieDict: Mapping whose values are JSON-serializable movie
            documents. ``None`` (the default) is treated as an empty mapping.
        delete: When true, the ``tmdb`` collection is deleted and created
            again with a single shard before the documents are sent.
    """
    if movieDict is None:
        movieDict = {}

    if delete:
        collections_url = "http://" + SOLR_ADDR + "/solr/admin/collections"
        # The DELETE response is not inspected (presumably because the
        # collection may not exist yet); only the CREATE status is reported.
        requests.get(collections_url + "?action=DELETE&name=tmdb")
        resp = requests.get(collections_url + "?action=CREATE&name=tmdb&numShards=1")
        print("solr building...", resp.status_code)

    # Serialize all documents as one well-formed JSON array. The previous
    # hand-built string ended with a trailing comma ("[{...},]"), which is
    # not valid JSON.
    bulkMovies = json.dumps(list(movieDict.values()))

    print("solr indexing...")
    resp = requests.post("http://" + SOLR_ADDR + "/solr/tmdb/update/json/docs?commit=true", data=bulkMovies, headers=headers)
    print("solr indexing done.", resp.status_code, resp.text)

In [1]:
#Faz a pesquisa especificada no Solr e imprime os resultados 
def get_field(hit, name):
    """Return ``hit[name]`` when the field is present, otherwise ``''``."""
    return hit[name] if name in hit else ''

def search_solr(usersSearch, qf='title overview'):
    """Run an edismax query against the Solr ``tmdb`` collection.

    Args:
        usersSearch: Query text; it is concatenated into the request URL
            exactly as given.
        qf: Value of the edismax ``qf`` (query fields) parameter.

    Returns:
        A ``pandas.DataFrame`` with one row per hit (at most 30) and the
        columns Num, Relevance Score, Movie Title, Overview, Cast and
        Director, or ``None`` when Solr does not answer with HTTP 200 (the
        response body is printed in that case).
    """
    url = 'http://' + SOLR_ADDR + '/solr/tmdb/select?q='+ usersSearch + '&defType=edismax&qf=' + qf + '&rows=30&wt=json&fl=title,overview,cast.name, directors.name,score'
    httpResp = requests.get(url, headers=headers)
    if httpResp.status_code != 200:
        print('Erro ao executar a consulta: ')
        print(httpResp.text)
        return
    searchHits = json.loads(httpResp.text)['response']['docs']
    print("Solr results")
    lista_resultados = []
    for idx, hit in enumerate(searchHits, start=1):
        # Every document field is optional: a movie without cast, directors
        # or title must not abort the listing with a KeyError ('overview'
        # was already read this way).
        filme = [
            idx,
            hit['score'],
            get_field(hit, 'title'),
            get_field(hit, 'overview'),
            get_field(hit, 'cast.name'),
            get_field(hit, 'directors.name'),
        ]
        lista_resultados.append(filme)

    # pandas >= 1.0 uses None for "do not truncate"; -1 was deprecated there
    # and is rejected by pandas 2.x. Older releases only accept an integer.
    try:
        pd.set_option('display.max_colwidth', None)
    except ValueError:
        pd.set_option('display.max_colwidth', -1)

    df = pd.DataFrame(lista_resultados,columns=['Num', 'Relevance Score', 'Movie Title', 'Overview', 'Cast', 'Director'], index=None)
    return df

In [9]:
def explain_solr(users_search, qf='title overview cast.name.bigramed directors.name.bigramed'):
    """Print how Solr parsed the given edismax query.

    The query is sent with ``debugQuery=true`` and the ``parsedquery`` entry
    of the debug section is printed.

    Args:
        users_search: Query text; it is concatenated into the request URL
            exactly as given.
        qf: Value of the edismax ``qf`` (query fields) parameter.
    """
    url = 'http://' + SOLR_ADDR + '/solr/tmdb/select?q='+ users_search + '&debugQuery=true&defType=edismax&qf=' + qf +'&rows=1&wt=json&fl=title,score'
    httpResp = requests.get(url, headers=headers)
    # Same error handling as search_solr: show Solr's error body instead of
    # failing below with a KeyError on the missing 'debug' section.
    if httpResp.status_code != 200:
        print('Erro ao executar a consulta: ')
        print(httpResp.text)
        return
    explain = json.loads(httpResp.text)['debug']['parsedquery']
    print('Explicação da query no Solr:')
    print(explain)
    print('\n')

### Elasticsearch: reindexação, pesquisa e explain

In [None]:
def reindex_elastic(analysisSettings=None, mappingSettings=None, movieDict=None):
    """Recreate the Elasticsearch ``tmdb`` index and bulk-index the movies.

    Args:
        analysisSettings: Value placed under ``settings.index.analysis`` of
            the new index. ``None`` (the default) is treated as ``{}``.
        mappingSettings: Optional value for the ``mappings`` section; it is
            omitted from the request when empty or ``None``.
        movieDict: Mapping whose values are JSON-serializable movie
            documents, each with an ``'id'`` key that is used as the
            document ``_id``. ``None`` (the default) is treated as ``{}``.
    """
    if analysisSettings is None:
        analysisSettings = {}
    if movieDict is None:
        movieDict = {}

    start = datetime.now()

    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings,
            },
        },
    }

    if mappingSettings:
        settings['mappings'] = mappingSettings

    # Drop any previous index; the response is not inspected (presumably
    # because the index may not exist yet).
    requests.delete("http://" + ELASTIC_ADDR + "/tmdb")
    resp = requests.put("http://" + ELASTIC_ADDR + "/tmdb",
                        data=json.dumps(settings), headers=headers)

    print("elastic building...", resp.status_code)

    if resp.status_code != 200:
        print(resp.text)

    # Bulk body: one action line followed by one document line per movie,
    # each terminated by a newline. No "_type" is sent.
    bulk_lines = []
    for movie in movieDict.values():
        addCmd = {"index": {"_index": "tmdb",
                            "_id": movie['id']}}
        bulk_lines.append(json.dumps(addCmd) + "\n" + json.dumps(movie) + "\n")
    bulkMovies = "".join(bulk_lines)

    print("elastic indexing...")
    resp = requests.post("http://" + ELASTIC_ADDR + "/_bulk", data=bulkMovies, headers=headers)
    print("elastic indexing done.", resp.status_code)

    # Report a failed bulk request the same way as a failed index creation.
    if resp.status_code != 200:
        print(resp.text)

    end = datetime.now()
    delta = date_diff_in_seconds(end, start)
    print('Elastic done! (took %d seconds)\n' % (delta))

In [None]:
def get_cast_list(cast, n):
    """Return the ``'name'`` of at most the first ``n`` entries of ``cast``.

    Args:
        cast: Iterable of dicts, each with a ``'name'`` key.
        n: Maximum number of names to return; ``None`` means no limit.

    Returns:
        A list of names in their original order.
    """
    # The original accepted ``n`` and kept a counter but never used either,
    # so the list was never limited.
    cast_list = []
    for member in cast:
        if n is not None and len(cast_list) >= n:
            break
        cast_list.append(member['name'])
    return cast_list

In [None]:
def search_elastic(usersSearch, query=None):
    """Run a search against the Elasticsearch ``tmdb`` index.

    Args:
        usersSearch: Query text used by the default query.
        query: Optional complete request body. When empty or ``None``, a
            ``multi_match`` over ``title`` (boosted by 10) and ``overview``
            limited to 30 hits is built from ``usersSearch``.

    Returns:
        A ``pandas.DataFrame`` with one row per hit and the columns Num,
        Relevance Score, Movie Title, Overview, Cast and Director, or
        ``None`` when Elasticsearch does not answer with HTTP 200 (the
        response body is printed in that case).
    """
    if not query:
        query = {
            'query': {
                'multi_match': {
                    'query': usersSearch,
                    'fields': ['title^10', 'overview']
                }
            },
            'size': '30'
        }

    url = 'http://'+ ELASTIC_ADDR +'/tmdb/_search'
    httpResp = requests.get(url, data=json.dumps(query), headers=headers)
    # Same error handling as search_solr: show the error body instead of
    # failing below with a KeyError on the missing 'hits' section.
    if httpResp.status_code != 200:
        print('Erro ao executar a consulta: ')
        print(httpResp.text)
        return
    searchHits = json.loads(httpResp.text)['hits']
    print("Elasticsearch results")
    lista_resultados = []
    for idx, hit in enumerate(searchHits['hits'], start=1):
        # Document fields are optional: a movie without overview, cast or
        # directors must not abort the listing with a KeyError.
        source = hit.get('_source', {})
        filme = [
            idx,
            hit['_score'],
            source.get('title', ''),
            source.get('overview', ''),
            get_cast_list(source.get('cast', []), 10),
            get_cast_list(source.get('directors', []), 5),
        ]
        lista_resultados.append(filme)

    # pandas >= 1.0 uses None for "do not truncate"; -1 was deprecated there
    # and is rejected by pandas 2.x. Older releases only accept an integer.
    try:
        pd.set_option('display.max_colwidth', None)
    except ValueError:
        pd.set_option('display.max_colwidth', -1)

    df = pd.DataFrame(lista_resultados,columns=['Num', 'Relevance Score', 'Movie Title', 'Overview', 'Cast', 'Director'], index=None)
    return df

In [None]:
def explain_elastic(query):
    """Print Elasticsearch's explanation of how it validates ``query``.

    The query body is sent to the ``tmdb`` index's ``_validate/query?explain``
    endpoint and the JSON answer is pretty-printed with a two-space indent,
    keeping non-ASCII characters as they are.
    """
    endpoint = 'http://'+ ELASTIC_ADDR +'/tmdb/_validate/query?explain'
    httpResp = requests.get(endpoint, data=json.dumps(query), headers=headers)
    parsed = json.loads(httpResp.text)
    print('Explicação da query no Elasticsearch:')
    print(json.dumps(parsed, indent=2, ensure_ascii=False))
    print('\n')