In [2]:
from dotenv import load_dotenv,dotenv_values
import json
import os
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient, SearchIndexingBufferedSender  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryCaptionResult,
    QueryAnswerResult,
    SemanticErrorMode,
    SemanticErrorReason,
    SemanticSearchResultsType,
    QueryType,
    VectorizedQuery,
    VectorQuery,
    VectorFilterMode,    
)
from azure.search.documents.indexes.models import (  
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    SemanticPrioritizedFields,
    SemanticField,  
    SearchField,  
    SemanticSearch,
    VectorSearch,  
    HnswAlgorithmConfiguration,
    HnswParameters,  
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    ExhaustiveKnnParameters,
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    SemanticField,  
    SearchField,  
    VectorSearch,  
    HnswParameters,  
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile,
)  
  
# Configure environment variables  
load_dotenv()  
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT") 
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME") 
key = os.getenv("AZURE_SEARCH_ADMIN_KEY") 
model = os.getenv("MODEL_NAME")
credential = AzureKeyCredential(key)

In [4]:
index_client = SearchIndexClient(
    endpoint=service_endpoint, credential=credential)
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    SearchableField(name="line", type=SearchFieldDataType.String),
    SearchField(name="embedding", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=384, vector_search_profile_name="myHnswProfile")
]

In [5]:
index_client

<azure.search.documents.indexes._search_index_client.SearchIndexClient at 0x1f8e5432b50>

In [6]:
# Configure the vector search configuration  
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw",
            kind=VectorSearchAlgorithmKind.HNSW,
            parameters=HnswParameters(
                m=4,
                ef_construction=400,
                ef_search=500,
                metric=VectorSearchAlgorithmMetric.COSINE
            )
        ),
        ExhaustiveKnnAlgorithmConfiguration(
            name="myExhaustiveKnn",
            kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
            parameters=ExhaustiveKnnParameters(
                metric=VectorSearchAlgorithmMetric.COSINE
            )
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
        ),
        VectorSearchProfile(
            name="myExhaustiveKnnProfile",
            algorithm_configuration_name="myExhaustiveKnn",
        )
    ]
)

In [7]:
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="line")]
    )
)

# Create the semantic settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields,
                    vector_search=vector_search, 
                    semantic_search=semantic_search)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')


 biology created


In [8]:
# Upload some documents to the index
with open('output/docVectors.json', 'r') as file:  
    documents = json.load(file)  

In [9]:
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)

In [10]:
search_client

<SearchClient [endpoint='https://agsearch001.search.windows.net', index='biology']>

In [11]:
%%time
result = search_client.upload_documents(documents)
print(f"Uploaded {len(documents)} documents") 

Uploaded 73 documents
CPU times: total: 219 ms
Wall time: 2.74 s


In [12]:
model = SentenceTransformer(os.getenv("MODEL_NAME"))

In [13]:
query = "What is segmentation?"  
query_vector = model.encode([query])[0]
print(query_vector)

[ 1.70245077e-02 -9.67202429e-03 -3.97975780e-02 -4.14157473e-02
 -1.79855712e-03 -7.58505240e-02  5.32115065e-02  1.78187508e-02
  1.60154980e-02 -4.83783111e-02 -2.39067301e-02 -3.55373160e-03
 -2.08270364e-02  3.93998474e-02 -9.52671319e-02 -2.90863812e-02
 -4.94355671e-02  6.14180975e-02 -6.69129863e-02 -3.10570151e-02
  4.07211892e-02  5.90016991e-02 -7.08595291e-02 -2.52509248e-02
  3.31796408e-02  4.70836088e-02 -2.69988365e-02 -3.08165997e-02
  3.26630212e-02  1.64568648e-02  4.49804775e-02  9.02690291e-02
  1.48209944e-01  4.69967760e-02 -1.56246042e-02  3.02223153e-02
  3.94755155e-02  1.08202761e-02  2.13319920e-02  3.03754937e-02
 -4.67632525e-02 -4.76687998e-02 -3.42696942e-02 -4.35898826e-02
  9.45607871e-02  2.52115726e-02 -7.91070610e-03 -1.31769665e-03
 -1.32708671e-03 -1.51302041e-02 -6.85700998e-02 -5.53621762e-02
 -9.70548242e-02  6.37322515e-02  3.73306975e-04  4.80806828e-02
  6.39700219e-02 -5.31220669e-03  5.40202558e-02  6.58710487e-04
  5.36609702e-02 -5.38213

In [14]:
 
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))  
vector_query = VectorizedQuery(vector=query_vector, k_nearest_neighbors=3, fields="embedding")

results = search_client.search(  
    search_text=query,  
    vector_queries=[vector_query],
    select=["line"],
    top=3
)  
  
for result in results:  
    print(f"Score: {result['@search.score']}")  
    print(f"Content: {result['line']}\n")  

Score: 0.03253968432545662
Content: 3 Diagrammatic sectional view of :
(a) Coelomate (b) Pseudocoelomate
(c) AcoelomateThose animals in which the developing embryo has a thir d germinal layer ,
mesoderm , in between the ectoderm and endoderm, are called
triploblastic  animals (platyhelminthes to chordates, Figure 4.2b).
4.1.5 Segmentation
In some animals, the body is externally and internally divided into
segments with a serial repetition of at least some organs

Score: 0.03201844170689583
Content:  For example, in
earthworm, the body shows this pattern called metameric segmentation
and the phenomenon is known as metamerism .
4.1.6 Notochord
Notochord is a mesodermally derived rod-like structure formed on the
dorsal side during embryonic development in some animals. Animals with
notochord are called chordates and those animals which do not form this
structure are called non-chordates, e.g., porifera to echinoderms.
4

Score: 0.03151364624500275
Content: Segmentation in the body is firs

In [17]:
search_client = SearchClient(service_endpoint, 
                             index_name, 
                             AzureKeyCredential(key))  
vector_query = VectorizedQuery(vector=query_vector, 
                               k_nearest_neighbors=3, 
                               fields="embedding")

results = search_client.search(  
    search_text=query,  
    vector_queries=[vector_query],
    select=["line"],
    query_type=QueryType.SEMANTIC, 
    semantic_configuration_name='my-semantic-config', 
    query_caption=QueryCaptionType.EXTRACTIVE, 
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=3
)  

In [18]:
semantic_answers = results.get_answers()
for answer in semantic_answers:
    if answer.highlights:
        print(f"Semantic Answer: {answer.highlights}")
    else:
        print(f"Semantic Answer: {answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")

In [19]:
semantic_answers

[]

In [20]:
for result in results:
    print(f"Reranker Score: {result['@search.reranker_score']}")
    print(f"Content: {result['line']}")

    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")

Reranker Score: 2.023825168609619
Content: Segmentation in the body is first observed in which of the following:
(a) Platyhelminthes  (b) Aschelminthes  (c) Annelida  (d) Arthropoda
Rationalised 2023-24
54 BIOLOGY
14.Match the following:
(a)Operculum (i)Ctenophora
(b)Parapodia (ii)Mollusca
(c)Scales (iii)Porifera
(d)Comb plates (iv)Reptilia
(e)Radula (v)Annelida
(f)Hairs (vi)Cyclostomata and Chondrichthyes
(g)Choanocytes (vii) Mammalia
(h)Gill slits (viii) Osteichthyes
15
Caption: <em>segmentation</em> in the body is first observed in which of the following: (a) platyhelminthes  (b) aschelminthes  (c) annelida  (d) arthropoda rationalised 2023-24 54 biology 14.match the following: (a)operculum (i)ctenophora (b)parapodia (ii)mollusca (c)scales (iii)porifera (d)comb plates (iv)reptilia (e)radula (v)annelida (f)hairs (vi)cyclostomata …

Reranker Score: 1.9124540090560913
Content: 3 Diagrammatic sectional view of :
(a) Coelomate (b) Pseudocoelomate
(c) AcoelomateThose animals in which the 