# Step 4: Azure Cognitive Search

**Content**

* Create Azure Search Index
* Create Azure Search JSON
* Upload JSON documents on Azure Search

### References: 
* https://docs.microsoft.com/en-us/learn/modules/intro-to-azure-search/2-what-is-azure-search
* https://docs.microsoft.com/en-us/azure/search/cognitive-search-tutorial-blob-python
* https://docs.microsoft.com/en-us/azure/search/search-get-started-python
* https://docs.microsoft.com/pt-br/python/api/overview/azure/search-documents-readme?view=azure-python
* https://docs.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.searchclient?view=azure-python
* https://github.com/Azure-Samples/azure-search-python-samples/blob/master/Quickstart/REST/azure-search-quickstart.ipynb
* https://docs.microsoft.com/en-us/rest/api/searchservice/addupdate-or-delete-documents
* https://docs.microsoft.com/en-us/rest/api/searchservice/create-index
* https://docs.microsoft.com/en-us/azure/search/search-indexer-troubleshooting
* https://docs.microsoft.com/pt-br/azure/search/search-what-is-an-index

In [None]:
! pip install azure-search-documents


In [None]:
import json
import requests
from pprint import pprint
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient 
from azure.search.documents import SearchClient
from azure.search.documents.indexes.models import (
     ComplexField,
     CorsOptions,
     SearchIndex,
     ScoringProfile,
     SearchFieldDataType,
     SimpleField,
     SearchableField
 )
import yaml, os
import os.path 
from os import listdir
from collections import Counter


Load Configs

In [None]:
# Read the YAML configuration file that holds the service names, keys and tokens.
config_file = os.path.join("config", "config.yaml")
with open(config_file, 'r') as ymlfile:
    config = yaml.load(ymlfile, Loader=yaml.FullLoader)

##### Azure Cognitive Search settings
search_cfg = config['search']
service_name = search_cfg['service_name']
admin_key = search_cfg['admin_key']
index_name = search_cfg['index_name']
endpoint = "https://{}.search.windows.net/".format(service_name)
# NOTE(review): api_version is concatenated directly onto the request URLs
# later in this notebook, so it presumably already carries the
# "?api-version=" prefix -- confirm against config.yaml.
api_version = search_cfg['api_version']
headers = {
    'Content-Type': 'application/json',
    'api-key': admin_key,
}

##### Azure Storage settings
storage_cfg = config['azure_storage']
container_name = storage_cfg['container_name_audios']
az_storage_sas_token = storage_cfg['sas_token']
az_storage_name = storage_cfg['storage_name']
az_storage_uri = "https://{name}.dfs.core.windows.net/{container}/".format(
    name=az_storage_name,
    container=container_name,
)


In [None]:
# A single credential object is shared by both clients.
credential = AzureKeyCredential(admin_key)

# Client scoped to the configured index.
search_client = SearchClient(endpoint=endpoint,
                             index_name=index_name,
                             credential=credential)

# Service-level client for index management. It is not bound to one index,
# so it takes no index_name argument.
admin_client = SearchIndexClient(endpoint=endpoint,
                                 credential=credential)

    
def delete_search_index(service_name, index_name, admin_key):
    """Delete a search index through the SDK.

    Deletion is best-effort: any error raised by the SDK call is printed
    instead of propagated, so the notebook keeps running.

    Args:
        service_name: name of the Azure Cognitive Search service; the
            endpoint URL is derived from it.
        index_name: name of the index to delete.
        admin_key: admin API key of the search service.
    """
    # Build the endpoint from the argument instead of relying on a global.
    service_endpoint = "https://{}.search.windows.net/".format(service_name)
    index_client = SearchIndexClient(endpoint=service_endpoint,
                                     credential=AzureKeyCredential(admin_key))
    try:
        index_client.delete_index(index_name)
    except Exception as ex:
        print (ex)
    else:
        print ('Index', index_name, 'Deleted')
        

## Create Azure Search Index

In [None]:
# Definition of the index fields (the body posted when the index is created).

# Attribute set shared by the plain text fields that are searchable only.
_searchable_only = {
    "searchable": "true",
    "filterable": "false",
    "sortable": "false",
    "facetable": "false",
}

index_schema = {
    "name": index_name,
    "fields": [
        # Document key
        {
            "name": "nome_audio",
            "type": "Edm.String",
            "key": "true",
            "filterable": "true",
        },
        {
            "name": "blob_location",
            "type": "Edm.String",
            "searchable": "false",
            "filterable": "false",
            "sortable": "false",
            "facetable": "false",
        },
        {
            "name": "classificacao",
            "type": "Collection(Edm.String)",
            "searchable": "true",
            "filterable": "true",
            "sortable": "false",
            "facetable": "true",
        },
        {"name": "protocolo", "type": "Edm.String", **_searchable_only},
        {"name": "placa", "type": "Edm.String", **_searchable_only},
        {"name": "email", "type": "Edm.String", **_searchable_only},
    ],
}

In [None]:
def create_search_index_rest(endpoint, api_version, headers, index_schema):
    """Create a search index through the REST API.

    Posts ``index_schema`` as JSON to ``<endpoint>indexes<api_version>`` and
    prints the HTTP status code followed by the response body.
    """
    create_url = "".join((endpoint, "indexes", api_version))
    reply = requests.post(create_url, headers=headers, json=index_schema)
    for detail in (reply.status_code, reply.text):
        print(detail)

In [None]:
# Remove any existing index with this name before it is (re)created below.
delete_search_index(service_name, index_name, admin_key)

In [None]:
# Create the index from index_schema; the status code and response body are printed.
create_search_index_rest(endpoint, api_version, headers, index_schema)

## Create Azure Search JSON

Create the JSON document used to populate the Azure Search index

In [None]:
# Mapping from the classes (intents) that LUIS identified for each audio to
# the tags that will be searchable in Azure Search.

classe_tags = {
    "classe_roubo" : ["roubo"],
    "classe_furto" : ["furto"],
    "classe_colisao" : ["colisão"],
    "classe_quebra_retrovisores" : ["quebra retrovisores"],
    # Tags used when no class was identified ("não identificado" was misspelled).
    "None" : ["nenhum", "não identificado", "indefinido"]
}

In [None]:
def list_files(dir):
    """Return the names of the regular files located directly inside ``dir``.

    Sub-directories are left out; names come back in ``listdir`` order.
    """
    file_names = []
    for entry in listdir(dir):
        candidate = os.path.join(dir, entry)
        if os.path.isfile(candidate):
            file_names.append(entry)
    return file_names


def read_json_file(file_path):
    """Read a JSON file and return its parsed content.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding, so accented text is read the same way on every system.
    A leading byte-order mark, if present, is skipped.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as json_file:
        return json.load(json_file)


Summarize the transcription results

In [None]:

# Per-file summary:
#   {file name: {"classes": {intent: count}, "entities": [{"type": ..., "entity": ...}, ...]}}
result_json = dict()
dir_transcricoes = "transcricoes"
for i in list_files(dir_transcricoes):
    # Only JSON transcription files can be parsed; skip anything else in the folder.
    if not i.lower().endswith(".json"):
        continue
    print(i)
    json_data = read_json_file(os.path.join(dir_transcricoes, i))
    # Keep the first entity of every result item that has at least one entity.
    entities = [tok['entities'][0] for tok in json_data['result'] if len(tok['entities']) > 0]
    # Keep only the "type" and "entity" keys of each entity.
    filtered_keys = ["type", "entity"]
    entities_filtered = [dict((k, d[k]) for k in filtered_keys) for d in entities]
    # Count how many result items have each top-scoring intent.
    intents_summarized = dict(Counter(tok['topScoringIntent']['intent'] for tok in json_data['result']))
    # Keep only the intents whose name contains 'classe_' or 'None'.
    result_json[i] = {"classes": {k: v for k, v in intents_summarized.items() if 'classe_' in k or 'None' in k},
                      "entities": entities_filtered}
    print(result_json[i])

    #print(result_json[i])



In [None]:
# Build the JSON document with the audio information to upload to Azure Search.

def generate_json_search(json_data, tags_by_class=None, storage_uri=None, sas_token=None):
    """Build the Azure Search upload payload from the summarized transcriptions.

    Args:
        json_data: mapping of transcription file name to a summary dict with
            a "classes" mapping (class name -> count) and an "entities" list
            of {"type": ..., "entity": ...} dicts. It is not modified.
        tags_by_class: mapping of class name to a list of search tags.
            Defaults to the module-level ``classe_tags``.
        storage_uri: base URI the audio file name is appended to. Defaults to
            the module-level ``az_storage_uri``.
        sas_token: text appended after the ".wav" file name. Defaults to the
            module-level ``az_storage_sas_token``.

    Returns:
        ``{"value": [document, ...]}`` with one document per entry of
        ``json_data``.

    Raises:
        KeyError: if a class has no entry in ``tags_by_class``.
    """
    if tags_by_class is None:
        tags_by_class = classe_tags
    if storage_uri is None:
        storage_uri = az_storage_uri
    if sas_token is None:
        sas_token = az_storage_sas_token

    json_suffix = ".json"
    documents = []
    for file_name, summary in json_data.items():
        detected = list(summary.get("classes") or {})
        # When more than one class was detected, "None" carries no information.
        # Filter it out without touching the caller's data, and without
        # failing when "None" is not among the classes.
        if len(detected) > 1:
            detected = [name for name in detected if name != "None"]

        tags = []
        for class_name in detected:
            tags.extend(tags_by_class[class_name])

        # The audio name is the transcription file name without its ".json" suffix.
        if file_name.lower().endswith(json_suffix):
            audio_name = file_name[:-len(json_suffix)]
        else:
            audio_name = file_name

        document = {
            # No trailing whitespace: the action must be an exact action name.
            "@search.action": "mergeOrUpload",
            "nome_audio": audio_name,
            "blob_location": storage_uri + audio_name + ".wav" + sas_token,
            "classificacao": tags,
        }

        for entity in summary.get("entities") or []:
            value = entity["entity"]
            if entity["type"] == "protocolo":
                # Protocol numbers are stored without spaces or hyphens.
                value = value.replace(" ", "").replace("-", "")
            # The built-in e-mail entity is stored in the "email" field.
            field_name = entity["type"].replace("builtin.email", "email")
            document[field_name] = value

        documents.append(document)
    return {"value": documents}
    


In [None]:
# Build the upload payload from the per-file summaries collected in result_json.
json_search = generate_json_search(result_json)

In [None]:
# Display the generated payload for inspection.
json_search

## Upload JSON documents on Azure Search

In [None]:
def upload_documents_rest(endpoint, headers, index_schema, json_search, api_version):
    """Upload the JSON documents to the search index through the REST API.

    Posts ``json_search`` to
    ``<endpoint>indexes/<index name>/docs/index<api_version>`` and
    pretty-prints the JSON response.

    Args:
        endpoint: base URL of the search service, ending with "/".
        headers: HTTP headers sent with the request.
        index_schema: index definition; its "name" entry selects the target index.
        json_search: request body holding the documents to upload.
        api_version: text appended to the end of the URL.
    """
    # Take the index name from the schema argument instead of a global.
    target_index = index_schema["name"]
    url = endpoint + "indexes/" + target_index + "/docs/index" + api_version
    response = requests.post(url, headers=headers, json=json_search)
    try:
        index_content = response.json()
    except ValueError:
        # The body is not JSON (for example an empty or plain-text error
        # reply); show the raw response instead of failing while decoding it.
        print(response.status_code)
        print(response.text)
    else:
        pprint(index_content)

In [None]:
# Send the generated documents to the index; the service response is pretty-printed.
upload_documents_rest(endpoint, headers, index_schema, json_search, api_version)

To test, open the Azure Cognitive Search service in the Azure portal and click **Search explorer**.