# Azure Search End to End

## 1. Load the PDF and convert it into a document list
This function takes the path of a PDF file, loads it, cleans up the extracted page text, and splits it into a list of chunked document objects.

In [15]:
import langchain.document_loaders as PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
import re

In [17]:
def load_pdf_file(file_path, chunk_size=1000, chunk_overlap=100):
    """Load a PDF, tidy the extracted page text, and split it into chunks.

    Args:
        file_path: Path of the PDF file to load.
        chunk_size: Chunk size (in characters) handed to
            ``CharacterTextSplitter``. The splitter may still emit longer
            chunks; it reports those with a "Created a chunk of size ..."
            message.
        chunk_overlap: Overlap (in characters) between consecutive chunks,
            handed to ``CharacterTextSplitter``.

    Returns:
        The list of chunked documents returned by
        ``CharacterTextSplitter.split_documents``.
    """
    # NOTE: in this notebook ``PyPDFLoader`` is the alias given to the
    # ``langchain.document_loaders`` module, hence the doubled name below.
    loader = PyPDFLoader.PyPDFLoader(file_path)

    # Load the PDF document
    documents = list(loader.load())
    print("Document loaded successfully.")

    for document in documents:
        text = document.page_content
        # Re-join words that were hyphenated across a line break.
        text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
        # Replace line breaks with spaces, except the two newlines that
        # enclose a single whitespace character (newline, whitespace, newline).
        text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
        # Normalise any newline / optional whitespace / newline run to "\n\n".
        text = re.sub(r"\n\s*\n", "\n\n", text)
        document.page_content = text

    text_splitter = CharacterTextSplitter(chunk_size=chunk_size,
                                          chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(documents)

# Example usage: load the book once; the resulting `texts` list feeds the
# embedding step further down.
# NOTE(review): absolute, machine-specific path - point this at your own copy of the PDF.
PDF_PATH = r"C:\Users\ammar\Downloads\Steven A. Silbiger - The Ten-Day MBA-Harper Business (2012).pdf"
texts = load_pdf_file(PDF_PATH)

# Show a count and a short preview rather than dumping every chunk of the book.
print(f"{len(texts)} chunks created")
print(texts[:3])

Created a chunk of size 1349, which is longer than the specified 1000
Created a chunk of size 1120, which is longer than the specified 1000
Created a chunk of size 1260, which is longer than the specified 1000
Created a chunk of size 1083, which is longer than the specified 1000
Created a chunk of size 1254, which is longer than the specified 1000
Created a chunk of size 1025, which is longer than the specified 1000
Created a chunk of size 1444, which is longer than the specified 1000
Created a chunk of size 1339, which is longer than the specified 1000
Created a chunk of size 1258, which is longer than the specified 1000
Created a chunk of size 1080, which is longer than the specified 1000
Created a chunk of size 1371, which is longer than the specified 1000
Created a chunk of size 1267, which is longer than the specified 1000
Created a chunk of size 1388, which is longer than the specified 1000
Created a chunk of size 1256, which is longer than the specified 1000
Created a chunk of s

Document loaded successfully.


## 2. Embed each document using OpenAI's embeddings

In [18]:
import openai
import os
from dotenv import load_dotenv, find_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
import hashlib

In [19]:
# Load variables from a .env file (located via find_dotenv) into the environment.
_ = load_dotenv(find_dotenv())

openai.api_key = os.getenv("OPENAI_API_KEY")
model = 'text-embedding-ada-002'

embeddings = OpenAIEmbeddings(model=model,
                              api_key=openai.api_key,
                              chunk_size=1)

title = "Lesson - 5: Ten Day MBA"

# The title is the same for every chunk, so embed it once instead of issuing
# one extra embedding request per chunk inside the loop.
title_vector = embeddings.embed_query(title)

batch_array = []

for document in texts:
    page_number = document.metadata['page']

    # Deterministic document id: SHA-256 hex digest of
    # title + chunk text + page number (UTF-8 encoded).
    id_source = title + document.page_content + str(page_number)
    document_id = hashlib.sha256(id_source.encode('utf-8')).hexdigest()

    # One record per chunk, keyed by the hashed id.
    batch_array.append({
        "id": document_id,
        "title": title,
        # Copy so records do not share one mutable list object.
        "title_vector": list(title_vector),
        "content": document.page_content,
        "content_vector": embeddings.embed_query(document.page_content),
        "type": 'CLASS_MATERIAL',
        "filename": document.metadata['source'],
        "page_number": page_number
    })

print(batch_array[:3])

[{'id': '0d009ed2f9a42d72e075835deed09c39614a46122e1986010af70cd02cc3bce5', 'title': 'Lesson - 5: Ten Day MBA', 'title_vector': [-0.01297802829239562, -0.009683232358623615, 0.008913290248482718, -0.03626356474425003, -0.004612714307280809, 0.01498265088501348, -0.03651327469205505, -0.005802308995766128, -0.01607860341174929, -0.02598380068582327, 0.004411558398927091, 0.021683224744621362, 0.01056415646082838, 0.007165314094589199, 0.0038878591552848362, -0.002349709756224842, 0.02119767596820471, -0.008268204043566957, 0.020712127191788058, 0.0025231201665625925, -0.002448553587671871, -0.0011566468222798624, -0.022543340305390795, 0.00661733736555998, 0.0013473981106699282, -0.019338718753569947, 0.01310981975721745, -0.012166467236738802, -0.0014358374385686334, -0.02150287970600954, 0.027398829932202554, -0.02062889116340147, -0.011986121266804359, -0.011264733661776077, 0.011854328870659903, 0.007893637259214175, -0.007213869065363127, 0.011056641728164352, 0.02019883245169413, 

## 3. Create the index

In [7]:
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector 
from azure.search.documents import SearchIndexingBufferedSender
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,   
)

In [20]:
# Create a search index
search_endpoint = os.getenv('AZURE_AI_SEARCH_ENDPOINT')
credential = AzureKeyCredential(os.getenv('AZURE_AI_SEARCH_API_KEY'))
index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)

index_name = "test1"


def _embedding_field(field_name):
    """Build a searchable 1536-dimension vector field bound to 'my-vector-config'."""
    return SearchField(
        name=field_name,
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_configuration='my-vector-config',
    )


# Index schema: key, text/metadata fields, then the two embedding fields.
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="title", type=SearchFieldDataType.String, retrievable=True),
    SearchableField(name="content", type=SearchFieldDataType.String, retrievable=True),
    SimpleField(name="filename", type=SearchFieldDataType.String),
    SimpleField(name="page_number", type=SearchFieldDataType.Int32, retrievable=True),
    SearchableField(name="type", type=SearchFieldDataType.String),
    _embedding_field("title_vector"),
    _embedding_field("content_vector"),
]

# Vector search: a single HNSW configuration referenced by both vector fields.
hnsw_parameters = {
    "m": 4,
    "efConstruction": 400,
    "efSearch": 500,
    "metric": "cosine"
}
hnsw_config = HnswVectorSearchAlgorithmConfiguration(
    name="my-vector-config",
    kind="hnsw",
    parameters=hnsw_parameters,
)
vector_search = VectorSearch(algorithm_configurations=[hnsw_config])

# Optional semantic reranking: which fields act as title, keywords and content.
prioritized_fields = PrioritizedFields(
    title_field=SemanticField(field_name="title"),
    prioritized_keywords_fields=[SemanticField(field_name="filename")],
    prioritized_content_fields=[SemanticField(field_name="content")],
)
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=prioritized_fields,
)
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Assemble the index definition and create it (or update it if it already exists).
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_settings=semantic_settings,
)
result = index_client.create_or_update_index(index)
print(f'{result.name} created')

test1 created


## 4. Upload Documents

In [21]:
# The buffered sender only queues indexing actions; they reach the service when
# the buffer is flushed. Use it as a context manager so the sender is always
# closed, and flush explicitly so no queued document is left unsent.
with SearchIndexingBufferedSender(endpoint=os.getenv('AZURE_AI_SEARCH_ENDPOINT'),
                                  index_name=index_name,
                                  credential=credential) as batch_client:
    batch_client.upload_documents(documents=batch_array)
    batch_client.flush()

# 5. Query the data

## Vector Search

In [10]:
query = 'What does chapter 5 of the Ten Day MBA cover?'

search_client = SearchClient(os.getenv('AZURE_AI_SEARCH_ENDPOINT'), index_name, credential)

# Embed the question and compare it against both vector fields.
query_embedding = embeddings.embed_query(query)
vector = Vector(value=query_embedding, k=3, fields='content_vector, title_vector')

# Pure vector search: no keyword text is sent (search_text=None).
selected_fields = ['title', 'content', 'page_number', 'filename']
results = search_client.search(
    search_text=None,
    vectors=[vector],
    select=selected_fields,
    top=5,
)

print(results)

# (printed label, result key) pairs, in display order.
printed_fields = (
    ("page number", 'page_number'),
    ("filename", 'filename'),
    ("title", 'title'),
    ("content", 'content'),
    ("score", '@search.score'),
)
for result in results:
    for label, key in printed_fields:
        print(f"{label}: {result[key]}")
    print('+------------------------------------------+')
    

<iterator object azure.core.paging.ItemPaged at 0x2167ded9340>
page number: 467
filename: C:\Users\ammar\Downloads\Steven A. Silbiger - The Ten-Day MBA-Harper Business (2012).pdf
title: Lesson - 5: Ten Day MBA
content: Day 10
MBA MINICOURSES
RESEARCH AND COMPETITIVE INTELLIGENCE
GATHERING
PUBLIC SPEAKING
NEGOTIA TING
INTERNA TIONAL BUSINES S
BUSINES S LAW
BUSINES S WRITING
REAL EST ATE INVESTING
LEADERSHIP CO ACH
FINANCIAL PLANNER
 
THE TEN-MINUTE MINICOURSE ON RESEARCH
AND COMPETITIVE INTELLIGENCE GA THERING
It is said that information is power . That is why MBA schools
teach students resear ch skills. The key to eﬃcient and
productive resear ch is to know wher e to seek information.
By putting a little more eﬀort into your job, you as a Ten-Day
MBA may get that brilliant insight or fact that may elude
your less industrious colleagues. Of all the sections in this
book, this one may be the most valuable to you, so I ﬁnish
with it. Suppose you need facts about a competitor, a
person, or

## Hybrid Search

In [11]:
query = 'Economics for MBAs'

search_client = SearchClient(os.getenv('AZURE_AI_SEARCH_ENDPOINT'), index_name, credential)

# Hybrid search: the same query is sent as keyword text and as an embedding.
query_embedding = embeddings.embed_query(query)
vector = Vector(value=query_embedding, k=3, fields='content_vector, title_vector')

search_options = dict(
    search_text=query,
    vectors=[vector],
    select=['title', 'content', 'page_number', 'filename'],
    top=5,
)
results = search_client.search(**search_options)

print(results)

# (printed label, result key) pairs, in display order.
printed_fields = (
    ("page number", 'page_number'),
    ("filename", 'filename'),
    ("title", 'title'),
    ("content", 'content'),
    ("score", '@search.score'),
)
for result in results:
    for label, key in printed_fields:
        print(f"{label}: {result[key]}")
    print('+------------------------------------------+')

<iterator object azure.core.paging.ItemPaged at 0x2160720c6d0>
page number: 371
filename: C:\Users\ammar\Downloads\Steven A. Silbiger - The Ten-Day MBA-Harper Business (2012).pdf
title: Lesson - 5: Ten Day MBA
content: Schools like Chicago and MIT place a great deal of
emphasis on learning classical textbook economics, but
most others treat econom ics a bit more on an applied basis.
Harvar d and Darden have integrated economics into their
international studies courses.
Economics can boast about only a few basic concepts. So
how does one explain the endless volumes of comple x
academic literatur e that try to explain the booms and busts
of business cycles? Like the Holy Grail, the perfect economic
model is an elusive target that seduces many zealous
professors and thousands of Ph.D.’s in private industry . In
their wake over the past hundr ed years they have left
thousands of magic formulas, graphs, and charts. An MBA
should aim at understanding the fundamentals and the
vocabulary of ec

## Hybrid Search w/ Reranking

In [25]:
query = 'What is the name of the class materials?'

search_client = SearchClient(os.getenv('AZURE_AI_SEARCH_ENDPOINT'), index_name, credential)
vector = Vector(value=embeddings.embed_query(query), k=3, fields='content_vector, title_vector')

# Hybrid (keyword + vector) search with semantic reranking, requesting
# extractive captions and answers from 'my-semantic-config'.
results = search_client.search(search_text=query,
                               vectors=[vector],
                               select=['title', 'content','page_number','filename', 'type'],
                               query_type='semantic',
                               query_language='en-us',
                               semantic_configuration_name='my-semantic-config',
                               query_caption='extractive',
                               query_answer='extractive',
                               top=5)

# get_answers() is typed as optional; fall back to an empty list so the loop
# below is simply skipped instead of raising TypeError on None.
semantic_answers = results.get_answers() or []

for answer in semantic_answers:
    # Prefer the highlighted form of the answer when one is available.
    if answer.highlights:
        print(f"Semantic Answer: {answer.highlights}")
    else:
        print(f"Semantic Answer: {answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")

for result in results:
    print(f"page number: {result['page_number']}")
    print(f"filename: {result['filename']}")
    print(f"title: {result['title']}")
    print(f"content: {result['content']}")
    print(f"score: {result['@search.score']}")
    captions = result['@search.captions']
    if captions:
        # Only the first caption is shown, highlighted when possible.
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")
    print('+------------------------------------------+')

page number: 52
filename: C:\Users\ammar\Downloads\Steven A. Silbiger - The Ten-Day MBA-Harper Business (2012).pdf
title: Lesson - 5: Ten Day MBA
content: The choice of any one of these product diﬀer entiation techniques aﬀects the entire mark eting process, as it lays the groundwork for your promotional eﬀorts. A product can be diﬀer entiated from the competition by creative advertising and promotio n, even if competing products are physically identical. Perceptual maps and positioning can help to diﬀer entiate the product.  All the product attributes  mentioned aﬀect the positioning  of a product in the mark etplace. The mark eter can always call upon his company’s product engineers to develop a product’s physical characteristics if the proﬁts justif y it. As my perceptu al map of paper towels indicated, consumers have speciﬁc needs within a product class, and they perceive each product diﬀer ently. The mark eter’s job is to uniquely position the product (using a perceptua l map as a