## JSON script to create the index in the Azure AI Search portal

In [None]:
# The JSON index definition below can be pasted directly into the Azure portal to create the index.
# It is kept inside a string literal, so running this cell has no effect.

""" 
{
  "name": "resume_chatbot_index_v2",
  "defaultScoringProfile": null,
  "fields": [
    {
      "name": "id",
      "type": "Edm.String",
      "searchable": false,
      "filterable": true,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": true,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "vectorEncoding": null,
      "synonymMaps": []
    },
    {
      "name": "title",
      "type": "Edm.String",
      "searchable": true,
      "filterable": true,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "vectorEncoding": null,
      "synonymMaps": []
    },
    {
      "name": "content",
      "type": "Edm.String",
      "searchable": true,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "vectorEncoding": null,
      "synonymMaps": []
    },
    {
      "name": "content_vector",
      "type": "Collection(Edm.Single)",
      "searchable": true,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": 1536,
      "vectorSearchProfile": "myHnswProfile",
      "vectorEncoding": null,
      "synonymMaps": []
    },
    {
      "name": "metadata",
      "type": "Edm.String",
      "searchable": true,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "vectorEncoding": null,
      "synonymMaps": []
    }
  ],
  "scoringProfiles": [],
  "corsOptions": null,
  "suggesters": [],
  "analyzers": [],
  "normalizers": [],
  "tokenizers": [],
  "tokenFilters": [],
  "charFilters": [],
  "encryptionKey": null,
  "similarity": {
    "@odata.type": "#Microsoft.Azure.Search.BM25Similarity",
    "k1": null,
    "b": null
  },
  "semantic": null,
  "vectorSearch": {
    "algorithms": [
      {
        "name": "default",
        "kind": "hnsw",
        "hnswParameters": {
          "metric": "cosine",
          "m": 4,
          "efConstruction": 400,
          "efSearch": 500
        },
        "exhaustiveKnnParameters": null
      },
      {
        "name": "default_exhaustive_knn",
        "kind": "exhaustiveKnn",
        "hnswParameters": null,
        "exhaustiveKnnParameters": {
          "metric": "cosine"
        }
      }
    ],
    "profiles": [
      {
        "name": "myHnswProfile",
        "algorithm": "default",
        "vectorizer": null,
        "compression": null
      },
      {
        "name": "myExhaustiveKnnProfile",
        "algorithm": "default_exhaustive_knn",
        "vectorizer": null,
        "compression": null
      }
    ],
    "vectorizers": [],
    "compressions": []
  }
}

"""

## Code to populate the index

In [1]:
import json
import os
import sys
import warnings

# Make the project root importable before the local `backend` / `config` imports below.
sys.path.append(os.path.dirname(os.getcwd()))
warnings.filterwarnings('ignore')

from backend.retriever import Retriever
from config.configuration import load_config
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

parameters = load_config()


### Document Splitting and Embedding for Index Preparation

In [None]:
# Folder that holds the source files to be indexed.
docs_folder_path = "resume_chatbot/data/splitted"

# Project retriever, built from the loaded configuration.
retriever = Retriever(parameters)

# Chunking settings: chunk length and overlap are measured with len(), i.e. in characters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)


def create_documents_list(splitted_docs, title, embeddings, id_counter):
    """Convert the chunks of one source file into index-ready dictionaries.

    Args:
        splitted_docs: Iterable of chunk objects exposing ``page_content``
            and ``metadata`` attributes.
        title: Value stored in the ``title`` field of every record.
        embeddings: Object exposing ``embed_query(text)``; whatever it returns
            is stored in the ``content_vector`` field.
        id_counter: Last id that has already been used. Ids for the new
            records continue from ``id_counter + 1``.

    Returns:
        A ``(documents_list, id_counter)`` tuple: the list of record dicts
        (keys ``id``, ``title``, ``content``, ``content_vector``, ``metadata``)
        and the updated counter, i.e. the last id that was assigned.
    """
    documents_list = []
    for doc in splitted_docs:
        id_counter += 1
        print('Creating document: ', id_counter)
        documents_list.append({
            'id': str(id_counter),
            'title': title,
            'content': doc.page_content,
            'content_vector': embeddings.embed_query(doc.page_content),
            # Serialise *this* chunk's metadata (not the first chunk's) with a
            # real JSON encoder: quote-swapping on str(dict) breaks on
            # apostrophes and on None/True/False. default=str keeps
            # non-JSON-native values from raising.
            'metadata': json.dumps(doc.metadata, ensure_ascii=False, default=str),
        })
    return documents_list, id_counter

# Embedding client configured from the loaded parameters.
embeddings = OpenAIEmbeddings(
    openai_api_key=parameters['openai_api_key'],
    openai_api_version=parameters['openai_api_version'],
    model=parameters['model'],
)

# Walk the source folder, split each file into chunks and collect the
# index-ready records for all files in one list.
id_counter = 0
all_document_list = []
# os.listdir() returns entries in arbitrary order; sort (case-insensitively)
# so that the ids assigned to each chunk are the same on every run.
for doc_path in sorted(os.listdir(docs_folder_path), key=str.lower):
    # Only .docx files are handed to the docx loader; skip sub-folders,
    # hidden files and Word "~$" lock files that may sit in the same folder.
    if doc_path.startswith('~$') or not doc_path.lower().endswith('.docx'):
        continue
    print('File: ', doc_path)
    document = retriever.docx_loader(os.path.join(docs_folder_path, doc_path))
    splitted_docs = text_splitter.split_documents(document)
    # splitext strips only the extension, so titles containing dots stay intact.
    title = os.path.splitext(doc_path)[0]
    document_list, id_counter = create_documents_list(splitted_docs, title, embeddings, id_counter)
    all_document_list.extend(document_list)



File:  companies where I worked.docx
Creating document:  1
File:  Core Competences.docx
Creating document:  2
File:  Courses and Certifications.docx
Creating document:  3
Creating document:  4
Creating document:  5
File:  Education.docx
Creating document:  6
File:  Job Experience.docx
Creating document:  7
Creating document:  8
Creating document:  9
Creating document:  10
Creating document:  11
Creating document:  12
Creating document:  13
File:  Languages.docx
Creating document:  14
File:  Personal Information.docx
Creating document:  15
File:  Projects.docx
Creating document:  16
Creating document:  17
Creating document:  18
Creating document:  19
Creating document:  20
Creating document:  21
Creating document:  22
Creating document:  23
Creating document:  24
Creating document:  25
Creating document:  26
Creating document:  27
Creating document:  28
Creating document:  29
Creating document:  30
Creating document:  31
Creating document:  32
Creating document:  33
Creating document:  

### Upload list of documents to the Azure AI Search Index

In [4]:
## Upload list of Documents to the Index
# Nothing is sent to the search service while `upload` is False; set it to
# True to push `all_document_list` to the index named in the configuration.
upload = False
if upload:
    from azure.search.documents import SearchClient
    from azure.core.credentials import AzureKeyCredential
    AZURE_COGNITIVE_SEARCH_CREDENTIAL = AzureKeyCredential(parameters['azure_ai_search_api_key'])
    search_client = SearchClient(
        endpoint=parameters['azure_ai_search_url'],
        index_name=parameters['index_name'],
        credential=AZURE_COGNITIVE_SEARCH_CREDENTIAL,
    )

    result = search_client.upload_documents(documents=all_document_list)

    # One result is returned per uploaded document. Check all of them rather
    # than only the first, so partial failures are reported and an empty
    # result list does not raise IndexError.
    failed_keys = [item.key for item in result if not item.succeeded]
    print("Upload of new document succeeded: {}".format(bool(result) and not failed_keys))
    if failed_keys:
        print("Failed document ids: {}".format(failed_keys))

Upload of new document succeeded: True


In [6]:
# Smoke-test retrieval: ask the vector store for the three chunks closest
# to a sample query.
vector_store = retriever.get_vector_store()
sample_search_args = {
    "query": "machine learning models",
    "k": 3,
    "search_type": "similarity",
}
retrieved_docs = vector_store.similarity_search(**sample_search_args)


In [None]:
# Show the third-ranked hit (index 2). Guarded so that a search returning
# fewer than three documents prints a notice instead of raising IndexError.
if len(retrieved_docs) > 2:
    print(retrieved_docs[2].page_content)
else:
    print("Only {} document(s) retrieved; expected at least 3.".format(len(retrieved_docs)))