## Import required libraries

In [10]:
import zipfile
import os
import sys
import pandas as pd
import openai
import json  
import wget
from langchain.vectorstores.azuresearch import AzureSearch
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import Vector # Caution: this import relies on an older version of the library
from azure.search.documents import SearchIndexingBufferedSender
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,   
)
from dotenv import load_dotenv
# Load environment variables via python-dotenv before the os.getenv calls in later cells read them.
load_dotenv()
# Make the project's local source folder importable so the helper module below can be found.
# NOTE: '../../src' is resolved against the current working directory of the kernel.
sys.path.append(os.path.abspath('../../src'))
from azure_openai_conn import OpenAIembeddings


In [2]:
# Point the openai module at the configured deployment.
# Every value is read from the environment; nothing is hardcoded in the notebook.
openai.api_type = os.environ.get('OPENAI_API_TYPE')
openai.api_base = os.environ.get('AZURE_OPENAI_ENDPOINT')
openai.api_version = os.environ.get('OPENAI_API_VERSION')
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Name of the embeddings model, also taken from the environment (None when the variable is unset).
model: str = os.environ.get('OPENAI_EMBEDDINGS_MODEL_NAME')

### Configure Azure Cognitive Search Vector Store settings
You can find the service endpoint and API key in the Azure Portal or by using the [Search Management SDK](https://learn.microsoft.com/rest/api/searchmanagement/).

In [3]:
# Connection details for the Azure Cognitive Search service, read from the environment.
azure_search_endpoint: str = os.environ.get('SEARCH_SERVICE_ENDPOINT')
azure_search_key: str = os.environ.get('SEARCH_SERVICE_API_KEY')
credential = AzureKeyCredential(azure_search_key)

# Default index name; later cells rebind index_name before building each vector store.
index_name: str = "azure-cognitive-search-vector-demo"

In [4]:
# Instantiate the project's embeddings helper; its embed_query method is handed to the vector stores below.
embeddings = OpenAIembeddings()

## Simple Index

In [18]:
# Vector store for the simple (default-schema) demo index.
index_name = "cocacola-vector-demo"

vector_store = AzureSearch(
    azure_search_endpoint=azure_search_endpoint,
    azure_search_key=azure_search_key,
    index_name=index_name,
    # Texts and queries are embedded through the helper's embed_query method.
    embedding_function=embeddings.embed_query,
)

### Insert documents into the vector store

In [14]:
# Folder that holds the PDF filings to index.
destination_folder = '../../data/financebench'
pdf_folder_path = destination_folder

# Accumulates the Document objects returned by the PDF loader.
documents = []

# Filter for PDFs *before* limiting to a single file. os.listdir returns entries in
# arbitrary order, so slicing first could select a non-PDF entry and silently load nothing.
# The listing order itself is kept, so the same PDF is chosen whenever it was already first.
pdf_files = [name for name in os.listdir(pdf_folder_path) if name.endswith('.pdf')]

for file in pdf_files[:1]:  # only the first PDF is indexed in this demo
    print(file)
    pdf_path = os.path.join(pdf_folder_path, file)
    loader = PyPDFLoader(pdf_path)
    documents.extend(loader.load())

COCACOLA_2021_10K.pdf


In [19]:
# Chunking parameters: maximum characters per chunk and characters shared between neighbours.
chunk_size = 1024
overlap = 100

# NOTE: `embeddings` is deliberately not re-created here. The instance from the earlier cell is
# already bound to the vector store, and rebinding the global mid-notebook would leave later cells
# using a different instance than the one the store embeds with.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=overlap,
    add_start_index=True,  # surfaces as 'start_index' in each chunk's metadata
)
chunked_documents = text_splitter.split_documents(documents)

In [20]:
# Upload every chunk to the index; the returned list of ids is shown as the cell output.
vector_store.add_documents(chunked_documents)

['ZmI1NmNjNGEtN2ZjZS00ZTEzLWI2NjUtMzhlOWNjNDNiNThl',
 'OTgxYmJlMjItODliYy00ZTk2LWJiNzAtMDE4MjJlZDdjMDZk',
 'ZDMxZDIwZTgtODMyMC00YmJmLTlmMDUtNDAyOTFmMjliZTE2',
 'YzgyNTdlZjctNzM4Yi00ZjA0LTlmMmQtODQ2OGYzOTc4NThj',
 'NGE3Y2FjMDgtZWM5Ny00NGM3LThkYTYtNjQ5MWEyOGU4YmY4',
 'NDllYzU0OGEtOTRlZi00NmU5LWFkNWUtYWQwZGY0MGMwYzBk',
 'MzlkNGFmYjctMmRhYy00NzlkLTk2ZDEtMDA4YjIzNzk4Njk5',
 'NmRkNGY3NzMtYjcyMy00MGQyLWFiNzktMTlhNGE5MjNhNjVl',
 'Nzk4ZDc2MmUtOTE0Zi00ZGZiLWFkNjgtMTI0NTIyMmQyMDVl',
 'NDBhZmQzYjctODJlMS00MGZhLTk0ZjAtNTE1MTc5YTA0YzY2',
 'ZWQ4NjZkNjgtZGU3Ni00ZTZkLWEzYTUtMjdhMmFjZDRmNjM4',
 'Y2ExMzM5MzgtNWViZS00MzBjLTlkZDgtOWU0YTZjODk0NTI4',
 'ZTU5OTg1MzktZDE5OC00MjY1LWFhMTEtYTAyNDgyN2RkOTMy',
 'MzhmZjk5YzYtZmQ3My00ZTlhLTk0OGItNmIwMTZiYzc5M2Vj',
 'NzNkZmEwMjItYTEwNy00NzRiLTkzYjYtY2I5MDEyYWEwOGU2',
 'NjI3OTM4NDgtMGRiYS00ZTdiLTliNTUtZDY3MDZmYjkzMTVl',
 'Y2QxOWNhODgtNGZjNi00YzhkLTkwNjAtZWYyZWE5MmE5YWQ5',
 'YzBkZjE2MDUtNjYxNC00MWNlLTk1MTQtYTg0MzQ2NGZjYzQ4',
 'ZWE0YjQ1ODctODU5OS00N2NmLTg2NTItNTc3ZDg5OTFh

In [21]:
# Vector similarity search: ask for the top three matches for the question.
docs = vector_store.similarity_search(
    "What is the CONSOLIDATED STATEMENTS OF INCOME of Coca Cola",
    k=3,
    search_type="similarity",
)

# Show the text of the best-ranked chunk.
top_hit = docs[0]
print(top_hit.page_content)

THE COCA-COLA COMPANY AND SUBSIDIARIES
CONSOLIDATED STATEMENTS OF INCOME
(In millions except per share data)
Year Ended December 31, 2021 2020 2019
Net Operating Revenues $ 38,655 $ 33,014 $ 37,266 
Cost of goods sold 15,357 13,433 14,619 
Gross Profit 23,298 19,581 22,647 
Selling, general and administrative expenses 12,144 9,731 12,103 
Other operating charges 846 853 458 
Operating Income 10,308 8,997 10,086 
Interest income 276 370 563 
Interest expense 1,597 1,437 946 
Equity income (loss) — net 1,438 978 1,049 
Other income (loss) — net 2,000 841 34 
Income Before Income Taxes 12,425 9,749 10,786 
Income taxes 2,621 1,981 1,801 
Consolidated Net Income 9,804 7,768 8,985 
Less: Net income (loss) attributable to noncontrolling interests 33 21 65 
Net Income Attributable to Shareowners of The Coca-Cola Company $ 9,771 $ 7,747 $ 8,920 
Basic Net Income Per Share $ 2.26 $ 1.80 $ 2.09 
Diluted Net Income Per Share $ 2.25 $ 1.79 $ 2.07 
Average Shares Outstanding — Basic 4,315 4,295 4,2

OK, 100% match: the top result is exactly the Consolidated Statements of Income section the query asked for.

In [22]:
# Print each hit's text and metadata, followed by a 100-dash rule as a separator.
for doc in docs:
    print(f'{doc.page_content},\n,{doc.metadata}', '-'*100, sep='\n')

THE COCA-COLA COMPANY AND SUBSIDIARIES
CONSOLIDATED STATEMENTS OF INCOME
(In millions except per share data)
Year Ended December 31, 2021 2020 2019
Net Operating Revenues $ 38,655 $ 33,014 $ 37,266 
Cost of goods sold 15,357 13,433 14,619 
Gross Profit 23,298 19,581 22,647 
Selling, general and administrative expenses 12,144 9,731 12,103 
Other operating charges 846 853 458 
Operating Income 10,308 8,997 10,086 
Interest income 276 370 563 
Interest expense 1,597 1,437 946 
Equity income (loss) — net 1,438 978 1,049 
Other income (loss) — net 2,000 841 34 
Income Before Income Taxes 12,425 9,749 10,786 
Income taxes 2,621 1,981 1,801 
Consolidated Net Income 9,804 7,768 8,985 
Less: Net income (loss) attributable to noncontrolling interests 33 21 65 
Net Income Attributable to Shareowners of The Coca-Cola Company $ 9,771 $ 7,747 $ 8,920 
Basic Net Income Per Share $ 2.26 $ 1.80 $ 2.09 
Diluted Net Income Per Share $ 2.25 $ 1.79 $ 2.07 
Average Shares Outstanding — Basic 4,315 4,295 4,2

### Perform a vector similarity search with relevance scores

In [23]:
from pprint import pprint

# Same question as before, but each hit comes back paired with a relevance score.
# score_threshold=0.80 is forwarded to the store (presumably to drop lower-scoring hits; see langchain docs).
docs_and_scores = vector_store.similarity_search_with_relevance_scores(
    "What is the CONSOLIDATED STATEMENTS OF INCOME of Coca Cola",
    k=4,
    score_threshold=0.80,
)

pprint(docs_and_scores)

[(Document(page_content='THE COCA-COLA COMPANY AND SUBSIDIARIES\nCONSOLIDATED STATEMENTS OF INCOME\n(In millions except per share data)\nYear Ended December 31, 2021 2020 2019\nNet Operating Revenues $ 38,655 $ 33,014 $ 37,266 \nCost of goods sold 15,357 13,433 14,619 \nGross Profit 23,298 19,581 22,647 \nSelling, general and administrative expenses 12,144 9,731 12,103 \nOther operating charges 846 853 458 \nOperating Income 10,308 8,997 10,086 \nInterest income 276 370 563 \nInterest expense 1,597 1,437 946 \nEquity income (loss) — net 1,438 978 1,049 \nOther income (loss) — net 2,000 841 34 \nIncome Before Income Taxes 12,425 9,749 10,786 \nIncome taxes 2,621 1,981 1,801 \nConsolidated Net Income 9,804 7,768 8,985 \nLess: Net income (loss) attributable to noncontrolling interests 33 21 65 \nNet Income Attributable to Shareowners of The Coca-Cola Company $ 9,771 $ 7,747 $ 8,920 \nBasic Net Income Per Share $ 2.26 $ 1.80 $ 2.09 \nDiluted Net Income Per Share $ 2.25 $ 1.79 $ 2.07 \nAver

### Perform a hybrid search

In [24]:
from pprint import pprint

# Hybrid search for the same question, limited to three results.
docs = vector_store.hybrid_search(
    "What is the CONSOLIDATED STATEMENTS OF INCOME of Coca Cola",
    k=3,
)

pprint(docs)

[Document(page_content='THE COCA-COLA COMPANY AND SUBSIDIARIES\nCONSOLIDATED STATEMENTS OF COMPREHENSIVE INCOME\n(In millions)\nYear Ended December 31, 2021 2020 2019\nConsolidated Net Income $ 9,804 $ 7,768 $ 8,985 \nOther Comprehensive Income:\nNet foreign currency translation adjustments (699) (911) 74 \nNet gains (losses) on derivatives 214 15 (54)\nNet change in unrealized gains (losses) on available-for-sale debt securities (90) (47) 18 \nNet change in pension and other postretirement benefit liabilities 712 (267) (159)\nTotal Comprehensive Income 9,941 6,558 8,864 \nLess: Comprehensive income (loss) attributable to noncontrolling interests (101) (132) 110 \nTotal Comprehensive Income Attributable to Shareowners of\n  The Coca-Cola Company $ 10,042 $ 6,690 $ 8,754 \nRefer to Notes to Consolidated Financial Statements.\n61', metadata={'id': 'MWQxYmFhZTItNTBkZS00ZDBhLWIwNDgtYTIxOWFhZjZkMTMy', 'source': '../../data/financebench/COCACOLA_2021_10K.pdf', 'page': 62, 'start_index': 0}),

## Index with metadata

In [25]:
from azure.search.documents.indexes.models import (
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
)

# Alias for the embeddings helper's query-embedding method; used below to size the vector field
# and passed to the custom-index vector store.
embedding_function = embeddings.embed_query

In [26]:
# Embed a throwaway string once to discover how long the embedding vectors are.
embedding_dimensions = len(embedding_function("Text"))

# Schema of the custom index: the four core fields plus two extra metadata-derived fields.
fields = [
    # Document key.
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, filterable=True),
    # Raw chunk text.
    SearchableField(name="content", type=SearchFieldDataType.String, searchable=True),
    # Embedding of the chunk text.
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=embedding_dimensions,
        vector_search_configuration="default",
    ),
    # Metadata stored as a single string field.
    SearchableField(name="metadata", type=SearchFieldDataType.String, searchable=True),
    # Additional field to store the title
    SearchableField(name="title", type=SearchFieldDataType.String, searchable=True),
    # Additional field for filtering on document source
    SimpleField(name="source", type=SearchFieldDataType.String, filterable=True),
]

In [27]:
# Second vector store: same service and credentials, but a separate index that uses the custom schema.
index_name = "cocacola-vector-demo-custom"

vector_store = AzureSearch(
    azure_search_endpoint=azure_search_endpoint,
    azure_search_key=azure_search_key,
    index_name=index_name,
    embedding_function=embedding_function,
    # Custom field definitions built in the previous cell.
    fields=fields,
)

In [28]:
# Metadata keys that match a field defined in the index are written to that field.
# Here the metadata carries a title, a source and a "random" entry:
#   - title and source have matching index fields, so they are stored as separate fields;
#   - random has no matching field (it is not in the fields list), so it only lives in the metadata field.
vector_store.add_texts(
    texts=["Test 1"],  # the document text
    metadatas=[
        {"title": "10-K", "source": "EDGAR", "random": "10290"},  # its metadata
    ],
)

['ZTJlNDI2MTAtYzgxOC00ZDgyLWFlYWUtNDMzNWY3YWRiN2Y3']

In [29]:
# Query the custom index in hybrid mode and display the hits with their metadata.
res = vector_store.similarity_search(
    "Test 1",
    k=3,
    search_type="hybrid",
)
res

[Document(page_content='Test 1', metadata={'id': 'ZTJlNDI2MTAtYzgxOC00ZDgyLWFlYWUtNDMzNWY3YWRiN2Y3', 'title': '10-K', 'source': 'EDGAR', 'random': '10290'})]