## Import required libraries

In [9]:
import json
import os
import sys
import zipfile
from pprint import pprint

import openai
import pandas as pd
import wget
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents import SearchIndexingBufferedSender
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SearchField,
    SemanticSettings,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
)
from azure.search.documents.models import Vector  # Caution: relies on an old version of the library
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.azuresearch import AzureSearch
# Load variables from a local .env file into the process environment
load_dotenv()
# Make the project's src directory importable so the local helper module below resolves
# (the relative path is resolved against the current working directory)
sys.path.append(os.path.abspath('../../src'))
from azure_openai_conn import OpenAIembeddings


In [10]:
# Configure the openai module from environment variables
# (load_dotenv() was called in the imports cell, so a local .env file can supply them)
openai.api_type = os.getenv('OPENAI_API_TYPE')
openai.api_base = os.getenv('AZURE_OPENAI_ENDPOINT')
openai.api_version = os.getenv('OPENAI_API_VERSION')
openai.api_key = os.getenv('OPENAI_API_KEY')
# os.getenv returns None for an unset variable, so `model` may be None despite the `str` annotation
model: str = os.getenv('OPENAI_EMBEDDINGS_MODEL_NAME') 

### Configure Azure Cognitive Search Vector Store settings
You can find the search service endpoint and API key in the Azure Portal or by using the [Search Management SDK](https://learn.microsoft.com/rest/api/searchmanagement/)

In [11]:
# Endpoint and API key of the search service, read from the environment (None if unset)
azure_search_endpoint: str = os.getenv('SEARCH_SERVICE_ENDPOINT') 
azure_search_key: str = os.getenv('SEARCH_SERVICE_API_KEY')
# Initial index name; it is reassigned in the "Simple Index" section before any vector store is created
index_name: str = "azure-cognitive-search-vector-demo"
# Key-based credential built from the API key; not referenced again in the cells shown here
credential = AzureKeyCredential(azure_search_key)

In [12]:
# Project-local embeddings helper from azure_openai_conn; its embed_query method is used below as the embedding function
embeddings = OpenAIembeddings()

## Simple Index

In [38]:
# Target index for the FinanceBench sample documents
index_name = "financebench-small"

# Vector store bound to that index; text is embedded with embeddings.embed_query
vector_store = AzureSearch(
    index_name=index_name,
    azure_search_endpoint=azure_search_endpoint,
    azure_search_key=azure_search_key,
    embedding_function=embeddings.embed_query,
)

### Insert documents into the vector store

In [39]:
# PDF filings to load and index, one file name per entry
companies = (
    "MICROSOFT_2023_10K.pdf",
    "JOHNSON&JOHNSON_2022Q4_EARNINGS.pdf",
    "Pfizer_2023Q2_10Q.pdf",
    "COSTCO_2021_10K.pdf",
    "BESTBUY_2017_10K.pdf",
    "BESTBUY_2019_10K.pdf",
)

In [40]:
# Number of PDF files listed for ingestion
len(companies)

6

In [33]:
# Folder holding the PDF files (relative to the notebook's working directory)
destination_folder = '../../data/financebench'
pdf_folder_path = destination_folder

# Load each listed PDF and collect everything the loaders return into one flat list
documents = []
for filename in companies:
    print(filename)
    if not filename.endswith('.pdf'):
        continue  # only .pdf entries are handed to the loader
    documents += PyPDFLoader(os.path.join(pdf_folder_path, filename)).load()

MICROSOFT_2023_10K.pdf
JOHNSON&JOHNSON_2022Q4_EARNINGS.pdf
Pfizer_2023Q2_10Q.pdf
COSTCO_2021_10K.pdf
BESTBUY_2017_10K.pdf
BESTBUY_2019_10K.pdf


In [35]:
# Splitter settings: target chunk size and the overlap between consecutive chunks
chunk_size = 1024
overlap = 100

# Split the loaded documents into chunks, recording each chunk's start index
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=overlap,
    add_start_index=True,
)
chunked_documents = text_splitter.split_documents(documents)

In [41]:
# Number of chunks produced by the splitter
len(chunked_documents)

1973

In [42]:
# Add the chunked documents to the index; the call returns a list of id strings (output below)
vector_store.add_documents(documents=chunked_documents)

['ODg1NmQ2ODItZWE5ZS00Yjc0LWEwYmEtNWNiNzEzZjlhZDcy',
 'M2UxOWVmOWUtMjQ1MC00MWE1LTkwMDMtMWIwZTQ5N2NlOGM5',
 'NzdlM2ZjYWUtOTVjYi00OTk1LWExMmUtYjI3YWJmMTdiMzZk',
 'MjZhODg5YTItNTAwYi00YmU2LTg5ZjctMzA2NjZiNGJjY2Rk',
 'MjUzZjIxZTktMzE5Yi00YzE3LWJkNzQtNzA5NDA1NTljY2Jm',
 'ZDQ0ZjgwOWEtNDVjMC00OTYwLWE0YmEtMzJhNmE1NTI2MDBj',
 'MDM0NTA5NmMtNzQyNS00OTIxLThmMmQtMWVjNGQ4YWU5YjRk',
 'MzdjZjFiZjYtMDRiMy00MWIxLTlmMGEtYjc5NzBhZjVmOGFi',
 'NjQyM2M2NGEtZTlmMi00ZDU1LTk3ODEtNWRlNTY2MTVjYWJi',
 'N2VkNjBiMTAtOWYwZi00Njk5LTkxZmEtMzg2NjU0MzA4OGE5',
 'YzUzODViMTYtOTEwZC00YzY1LThlZmItMGFiNzUwZjYyMmJh',
 'YmFlMThmYTQtOWEzOC00NGEyLWI5M2UtNDNlMzk0NDliOTJm',
 'ZWQ5MWRhNWYtZTMxZS00NDlmLTk4MTktNmY2OGUzZmE1M2Nk',
 'MWExNGZjZmQtMWI4Ni00Mjg0LThkZjQtODZhNjg3MmQwMDk0',
 'OTYyNTM4Y2YtNmMxYy00MGU1LWEyNGQtNTE4OTRjMzkyZDc5',
 'MGRiN2RiZDMtNGU0ZC00Y2IwLWFiYjctOTg0OGQwZWQ2NTVi',
 'ODNiMmM0NTktODZlNC00NzA0LWFkYzYtZWZjMjY5NjFlYmI1',
 'NmUzYmFlOTktNjcxMS00MTJlLTllN2QtODI3MTU1MTJlOWRk',
 'YmVlMGIyYzAtNjRlNi00MTZkLWFhMWQtNWUyMzY5ODFm

### Retrieve

In [43]:
# Perform a pure vector similarity search and print the text of the top hit.
# The query spells "Microsoft" in full so that it is the same query text used
# by the relevance-score and hybrid searches further down the notebook.
docs = vector_store.similarity_search(
    query="What is the CONSOLIDATED STATEMENTS OF INCOME of Microsoft",
    k=3,  # request three hits
    search_type="similarity",
)
print(docs[0].page_content)

PART II
Item 8
 
REPORT OF INDEPENDENT REGIST
ERED PUBLIC ACCOUNTING FIRM
To the Stockholders and the Board of Directors of Microsoft Corporation
 
Opinion on the Financial Statements
We have audited the accompanying consolidated balance sheets of Microsoft Corporation and subsidiaries (the "Company") as of June 30, 2023
 
and 2022, the related consolidated statements of income, comprehensive income, cash flows, and stockholders' equity, for each of the three
 
years in the period ended June 30, 2023, and the related notes (collectively referred to as the "financial statements"). In our opinion, the financial
 
statements present fairly, in all material respects, the financial position of the Company as of June 30, 2023 and 2022, and the results of its
 
operations and its cash flows for each of the three years in the period ended June 30, 2023, in conformity with accounting principles generally
 
accepted in the United States of America.


In [44]:
# Print each hit's text followed by its metadata, with a dashed rule after every hit
for d in docs:
    print(f'{d.page_content},\n,{d.metadata}')  # NOTE(review): the literal commas around the newline end up in the output — confirm they are intended
    print('-'*100)

PART II
Item 8
 
REPORT OF INDEPENDENT REGIST
ERED PUBLIC ACCOUNTING FIRM
To the Stockholders and the Board of Directors of Microsoft Corporation
 
Opinion on the Financial Statements
We have audited the accompanying consolidated balance sheets of Microsoft Corporation and subsidiaries (the "Company") as of June 30, 2023
 
and 2022, the related consolidated statements of income, comprehensive income, cash flows, and stockholders' equity, for each of the three
 
years in the period ended June 30, 2023, and the related notes (collectively referred to as the "financial statements"). In our opinion, the financial
 
statements present fairly, in all material respects, the financial position of the Company as of June 30, 2023 and 2022, and the results of its
 
operations and its cash flows for each of the three years in the period ended June 30, 2023, in conformity with accounting principles generally
 
accepted in the United States of America.,
,{'id': 'OTMzNjM5ZDQtYmNlMy00YjEzLWFhMWItOTM4M

### Perform a vector similarity search with relevance scores

In [45]:
# Vector similarity search that returns each hit together with its relevance score.
# k=4 requests four hits; score_threshold=0.80 is passed so that lower-scoring
# hits are filtered out by the vector store.
docs_and_scores = vector_store.similarity_search_with_relevance_scores(
    query="What is the CONSOLIDATED STATEMENTS OF INCOME of Microsoft",
    k=4,
    score_threshold=0.80,
)
# pprint is imported once in the notebook's top import cell
pprint(docs_and_scores)

[(Document(page_content='PART II\nItem 8\n \nREPORT OF INDEPENDENT REGIST\nERED PUBLIC ACCOUNTING FIRM\nTo the Stockholders and the Board of Directors of Microsoft Corporation\n \nOpinion on the Financial Statements\nWe have audited the accompanying consolidated balance sheets of Microsoft Corporation and subsidiaries (the "Company") as of June 30, 2023\n \nand 2022, the related consolidated statements of income, comprehensive income, cash flows, and stockholders\' equity, for each of the three\n \nyears in the period ended June 30, 2023, and the related notes (collectively referred to as the "financial statements"). In our opinion, the financial\n \nstatements present fairly, in all material respects, the financial position of the Company as of June 30, 2023 and 2022, and the results of its\n \noperations and its cash flows for each of the three years in the period ended June 30, 2023, in conformity with accounting principles generally\n \naccepted in the United States of America.', m

### Perform a hybrid search

In [46]:
# Perform a hybrid search with the same query text and pretty-print the three requested hits
docs = vector_store.hybrid_search(
    query="What is the CONSOLIDATED STATEMENTS OF INCOME of Microsoft", k=3
)
# pprint is imported once in the notebook's top import cell
pprint(docs)

[Document(page_content='PART II\nItem 8\n \nREPORT OF INDEPENDENT REGIST\nERED PUBLIC ACCOUNTING FIRM\nTo the Stockholders and the Board of Directors of Microsoft Corporation\n \nOpinion on the Financial Statements\nWe have audited the accompanying consolidated balance sheets of Microsoft Corporation and subsidiaries (the "Company") as of June 30, 2023\n \nand 2022, the related consolidated statements of income, comprehensive income, cash flows, and stockholders\' equity, for each of the three\n \nyears in the period ended June 30, 2023, and the related notes (collectively referred to as the "financial statements"). In our opinion, the financial\n \nstatements present fairly, in all material respects, the financial position of the Company as of June 30, 2023 and 2022, and the results of its\n \noperations and its cash flows for each of the three years in the period ended June 30, 2023, in conformity with accounting principles generally\n \naccepted in the United States of America.', me

## Index with metadata

In [25]:
from azure.search.documents.indexes.models import (
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
)

# Alias for the query-embedding callable; used below to size the vector field and passed to AzureSearch
embedding_function = embeddings.embed_query

In [26]:
# Index schema passed to AzureSearch: id, content, content_vector and metadata,
# plus the extra "title" and "source" fields
fields = [
    # Document key; also filterable
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    # Searchable text of the document
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Embedding vector; its dimension is measured by embedding a sample string
    # (this calls the embedding function when the cell runs)
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=len(embedding_function("Text")),
        vector_search_configuration="default",
    ),
    # Metadata kept as a single searchable string field
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field to store the title
    SearchableField(
        name="title",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field for filtering on document source
    SimpleField(
        name="source",
        type=SearchFieldDataType.String,
        filterable=True,
    ),
]

In [27]:
# Separate index used for the custom-schema demo
index_name: str = "cocacola-vector-demo-custom"

# Rebinds vector_store to the new index, passing the custom field definitions via `fields`;
# the cells that follow operate on this index
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=azure_search_endpoint,
    azure_search_key=azure_search_key,
    index_name=index_name,
    embedding_function=embedding_function,
    fields=fields,
)

In [28]:
# Metadata keys that have a matching field in the index are written to that field as well.
# Here "title" and "source" have dedicated fields, whereas "random" is not defined in the
# fields list and is therefore stored only inside the "metadata" field.
vector_store.add_texts(
    texts=["Test 1"],  # one document
    metadatas=[{"title": "10-K", "source": "EDGAR", "random": "10290"}],  # its metadata
)

['ZTJlNDI2MTAtYzgxOC00ZDgyLWFlYWUtNDMzNWY3YWRiN2Y3']

In [29]:
# Query the custom index through similarity_search in hybrid mode; the bare `res` displays the hits,
# whose metadata includes the title, source and random values (output below)
res = vector_store.similarity_search(query="Test 1", k=3, search_type="hybrid")
res

[Document(page_content='Test 1', metadata={'id': 'ZTJlNDI2MTAtYzgxOC00ZDgyLWFlYWUtNDMzNWY3YWRiN2Y3', 'title': '10-K', 'source': 'EDGAR', 'random': '10290'})]