In [1]:
from dotenv import load_dotenv,dotenv_values,find_dotenv
import json
import os
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient, SearchIndexingBufferedSender  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryCaptionResult,
    QueryAnswerResult,
    SemanticErrorMode,
    SemanticErrorReason,
    SemanticSearchResultsType,
    QueryType,
    VectorizedQuery,
    VectorQuery,
    VectorFilterMode,    
)
from azure.search.documents.indexes.models import (  
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    SemanticPrioritizedFields,
    SemanticField,  
    SearchField,  
    SemanticSearch,
    VectorSearch,  
    HnswAlgorithmConfiguration,
    HnswParameters,  
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    ExhaustiveKnnParameters,
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    SemanticField,  
    SearchField,  
    VectorSearch,  
    HnswParameters,  
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile,
)  
  
# Configure environment variables
# NOTE(review): values are read from "../.env.sample". Sample env files are
# usually committed templates -- confirm the admin key is not kept in a
# tracked file.
load_dotenv(find_dotenv("../.env.sample"),override = True)
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")
model = os.getenv("MODEL_NAME")  # not used by the index/upload steps below, so not validated

# Fail fast with a readable message: os.getenv returns None for unset
# variables, which would otherwise only surface later as an opaque error
# from the credential or from the search service.
_required_settings = {
    "AZURE_SEARCH_SERVICE_ENDPOINT": service_endpoint,
    "AZURE_SEARCH_INDEX_NAME": index_name,
    "AZURE_SEARCH_ADMIN_KEY": key,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise ValueError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

credential = AzureKeyCredential(key)

In [3]:
# Field definitions for the search index.
fields = [
    # Key field of the index.
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
        sortable=True,
        facetable=True,
    ),
    # Full-text searchable text of each document.
    SearchableField(name="line", type=SearchFieldDataType.String),
    # Source file name; also usable in filters and facets.
    SearchableField(
        name="filename",
        type=SearchFieldDataType.String,
        filterable=True,
        facetable=True,
    ),
    # Vector field, bound to the "myHnswProfile" vector search profile.
    # NOTE(review): 1536 presumably matches the length of each document's
    # "embedding" list -- confirm against the model that produced the vectors.
    SearchField(
        name="embedding",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    ),
]

# Client used to create/update the index definition on the service.
index_client = SearchIndexClient(credential=credential, endpoint=service_endpoint)

In [4]:
# Bare expression: the notebook echoes the client's repr as this cell's output.
index_client

<azure.search.documents.indexes._search_index_client.SearchIndexClient at 0x1556c32ef20>

In [5]:
# Vector search setup: an HNSW algorithm and an exhaustive KNN algorithm,
# both using the cosine metric, each exposed through its own named profile.
hnsw_algorithm = HnswAlgorithmConfiguration(
    name="myHnsw",
    kind=VectorSearchAlgorithmKind.HNSW,
    parameters=HnswParameters(
        m=4,
        ef_construction=400,
        ef_search=500,
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)
exhaustive_knn_algorithm = ExhaustiveKnnAlgorithmConfiguration(
    name="myExhaustiveKnn",
    kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
    parameters=ExhaustiveKnnParameters(
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)

# Profiles are what index fields reference by name; each one points at one
# of the algorithm configurations above.
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm_configuration_name="myHnsw",
)
exhaustive_knn_profile = VectorSearchProfile(
    name="myExhaustiveKnnProfile",
    algorithm_configuration_name="myExhaustiveKnn",
)

vector_search = VectorSearch(
    algorithms=[hnsw_algorithm, exhaustive_knn_algorithm],
    profiles=[hnsw_profile, exhaustive_knn_profile],
)

In [6]:
# Semantic configuration: "line" is the content field and "filename" the
# keywords field.
prioritized_fields = SemanticPrioritizedFields(
    content_fields=[SemanticField(field_name="line")],
    keywords_fields=[SemanticField(field_name="filename")],
)
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=prioritized_fields,
)

In [7]:

# Wrap the single semantic configuration in the index-level semantic settings.
semantic_search = SemanticSearch(configurations=[semantic_config])

# Assemble the index definition from the fields, the vector search setup and
# the semantic settings, then create or update it on the service.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)
result = index_client.create_or_update_index(index)

# Report the name of the index returned by the service.
print(f' {result.name} created')


 snowticket created


In [8]:
# Load the pre-computed documents that will be uploaded to the index.
# Alternative inputs previously referenced in this cell:
#   '../output/docVectors.json'
#   '../output/ExcelDoclineVectors.json'
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding.
with open('../output/ExcelAzureDoclineVectors.json', 'r', encoding='utf-8') as file:
    documents = json.load(file)

In [9]:
# Show the first loaded document as this cell's output.
# NOTE(review): the rendered output contains the ticket text and the full
# embedding vector -- consider clearing it before sharing the notebook.
documents[0]

{'@search.action': 'upload',
 'id': '0',
 'line': 'FOR-IT Platform, FOREST, 2023-01-01 09:30:28, 2023-02-20 10:00:05, INC2555629, FOR-IT year end activities not done, e.g. system_number_seq not updated - URGENT, please execute year end activities immediately, for example all transactions are now generated using 2022 receipt numbers., Other, 3 - Moderate, Closed, Hello,\n\nYear end activities are completed.So, closing this INC now\n\nBR,\nRam Biyani, Solved (Permanently)',
 'filename': 'incidents_2023_forest.xlsx',
 'embedding': [-0.018947089090943336,
  -0.02034897170960903,
  -0.03792308270931244,
  0.013303428888320923,
  -0.02387535572052002,
  0.01245073787868023,
  -0.05474567040801048,
  -0.014459620229899883,
  -0.04283689707517624,
  -0.0005604819161817431,
  0.04642109200358391,
  0.015565228648483753,
  -0.00760195916518569,
  -0.0028922853525727987,
  0.011489653028547764,
  0.0017659019213169813,
  0.010579152964055538,
  0.006879339460283518,
  0.01807994581758976,
  -0.00

In [10]:
# Upload the loaded documents to the index and report the real outcome.
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(documents)

# upload_documents returns one indexing result per document; a document can be
# rejected without the call raising, so inspect each result instead of
# assuming every document was stored.
failed_results = [item for item in result if not item.succeeded]
for item in failed_results:
    print(f"Failed to upload document id={item.key}: {item.error_message}")
if failed_results:
    print(f"{len(failed_results)} of {len(result)} documents failed to upload")
print(f"Uploaded {len(result) - len(failed_results)} documents")

Uploaded 1527 documents
