## Import required libraries

In [9]:
import json
import os
import sys
import zipfile
from pprint import pprint

import openai
import pandas as pd
import wget
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents import SearchIndexingBufferedSender
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SearchField,
    SemanticSettings,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
)
from azure.search.documents.models import Vector  # Note: old version of the library
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.azuresearch import AzureSearch

load_dotenv()
# Load OpenAI access and other custom paths
sys.path.append(os.path.abspath('../../src'))
from azure_openai_conn import OpenAIembeddings


In [10]:
# Configure the `openai` module from environment variables.
# Each entry maps an attribute of the `openai` module to the environment
# variable that supplies its value; a variable that is not set yields None.
_OPENAI_ENV_BINDINGS = {
    "api_type": "OPENAI_API_TYPE",
    "api_base": "AZURE_OPENAI_ENDPOINT",
    "api_version": "OPENAI_API_VERSION",
    "api_key": "OPENAI_API_KEY",
}
for _attr, _env_name in _OPENAI_ENV_BINDINGS.items():
    setattr(openai, _attr, os.environ.get(_env_name))

# Name of the embeddings model/deployment, also taken from the environment.
model: str = os.environ.get("OPENAI_EMBEDDINGS_MODEL_NAME")

### Configure Azure Cognitive Search Vector Store settings
You can find these values (the search service endpoint and its API key) in the Azure Portal or by using the [Search Management SDK](https://learn.microsoft.com/rest/api/searchmanagement/).

In [11]:
# Connection settings for the Azure Cognitive Search service.
index_name: str = "azure-cognitive-search-vector-demo"

# Endpoint and API key come from the environment (None when a variable is unset).
azure_search_key: str = os.environ.get("SEARCH_SERVICE_API_KEY")
azure_search_endpoint: str = os.environ.get("SEARCH_SERVICE_ENDPOINT")

# Key-based credential object built from the API key read above.
credential = AzureKeyCredential(azure_search_key)

In [12]:
# Instantiate the project's embeddings helper (imported from `azure_openai_conn`).
# Its `embed_query` method is later handed to the vector store as the embedding function.
embeddings = OpenAIembeddings()

### FinanceBench-Small (a subset sized to fit the Azure AI Search quota)

In [39]:
# File names of the PDF filings that make up the FinanceBench-Small subset.
companies = (
    "MICROSOFT_2023_10K.pdf",
    "JOHNSON&JOHNSON_2022Q4_EARNINGS.pdf",
    "Pfizer_2023Q2_10Q.pdf",
    "COSTCO_2021_10K.pdf",
    "BESTBUY_2017_10K.pdf",
    "BESTBUY_2019_10K.pdf",
)

In [None]:
# Load every selected PDF from the data folder and gather everything the
# loaders return into one flat `documents` list.
destination_folder = '../../data/financebench'
pdf_folder_path = destination_folder  # second name bound to the same folder
documents = []
for file in companies:
    print(file)  # echo each entry so the cell output shows what is processed
    if not file.endswith('.pdf'):
        continue  # entries without a .pdf suffix are not loaded
    pdf_path = os.path.join(pdf_folder_path, file)
    loader = PyPDFLoader(pdf_path)
    documents += loader.load()

MICROSOFT_2023_10K.pdf
JOHNSON&JOHNSON_2022Q4_EARNINGS.pdf
Pfizer_2023Q2_10Q.pdf
COSTCO_2021_10K.pdf
BESTBUY_2017_10K.pdf
BESTBUY_2019_10K.pdf


In [None]:
# Split the loaded documents into overlapping chunks.
# NOTE(review): sizes are presumably counted in characters (the splitter's
# default length function) -- confirm.
chunk_size, overlap = 1024, 100
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=overlap,
    add_start_index=True,
)
chunked_documents = text_splitter.split_documents(documents)

In [None]:
# Display how many chunks the splitter produced.
len(chunked_documents)

1973

In [59]:
# Display the first chunk as a quick sanity check of the split result.
chunked_documents[0]

Document(page_content='UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\n \nFORM \n10-K\n \n \n☒\nANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\n \n \n \nFor the Fiscal Year Ended\n June 30, \n2023\n \n \n \nOR\n \n \n☐\nTRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\n \n \n \nFor the Transition Period From                  to\nCommission File Number \n001-37845\n \n \nMICROSOFT CORPORATION\n \n \nWASHINGTON\n \n91-1144442\n(STATE OF INCORPORATION)\n \n(I.R.S. ID)\nONE MICROSOFT WAY\n, \nREDMOND\n, \nWASHINGTON\n \n98052-6399\n(\n425\n) \n882-8080\nwww.microsoft.com/investor\n \n \n \n \n \nSecurities registered pursuant to Section 12(b) of the Act:\n \n \n \n \n \n \n \n \n \nTitle of each class\n \nTrading Symbol\n \nName of exchange on which registered\n \n \n \n \n \nCommon stock, $\n0.00000625\n par value per share\n \nMSFT\n \nNASDAQ\n3.125% Notes due 2028\n \nMSFT\n \nNASDAQ

## Simple Index

In [38]:
# Build the LangChain vector store backed by the "financebench-small" index.
# `index_name` is rebound here, replacing the demo index name assigned in the
# Azure Cognitive Search settings cell.
# NOTE(review): the wrapper is expected to create the index when it does not
# exist yet -- confirm against the installed LangChain version.
index_name: str = "financebench-small"
_vector_store_settings = dict(
    azure_search_endpoint=azure_search_endpoint,
    azure_search_key=azure_search_key,
    index_name=index_name,
    embedding_function=embeddings.embed_query,  # one query string -> one embedding
)
vector_store: AzureSearch = AzureSearch(**_vector_store_settings)

### Insert the documents into the vector store

In [None]:
# Upload all chunks to the vector store.
# NOTE(review): re-running this cell presumably uploads the same chunks again -- confirm
# whether that creates duplicate entries in the index.
vector_store.add_documents(documents=chunked_documents)

### Retrieve

In [52]:
# Perform a similarity search
docs = vector_store.similarity_search(
    query="What is the Income Statement of Microsoft",
    k=5,
    search_type="similarity",
)

In [53]:
# Print each hit's text and metadata, followed by a 100-dash separator line.
for d in docs:
    print(f'{d.page_content},\n,{d.metadata}', '-' * 100, sep='\n')

manufacturing, marketing, and selling our other products and services; and income taxes.
Highlights from fiscal year 2023 compared with fiscal year 2022 included:
•
Microsoft Cloud revenue increased 22% to $111.6 billion.
•
Office Commercial products and cloud services revenue increased 10% driven by Office 365 Commercial growth of 13%.
•
Office Consumer products and cloud services revenue increased 2% and Microsoft 365 Consumer subscribers increased to 67.0 million.
•
LinkedIn revenue increased 10%.
•
Dynamics products and cloud services revenue increased 16% driven by Dynamics 365 growth of 24%.
•
Server products and cloud services revenue increased 19% driven by Azure and other cloud services growth of 29%.
•
Windows original equipment manufacturer licensing (“Windows OEM”) revenue decreased 25%.
•
Devices revenue decreased 24%.
•
Windows Commercial products and cloud services revenue increased 5%.
•
Xbox content and services revenue decreased 3%.
 
•,
,{'id': 'YzI0ZWE1YzMtZTVkMi00M

### Perform a vector similarity search with relevance scores

In [54]:
# Vector similarity search that returns (document, relevance score) pairs for
# up to 4 hits.
# NOTE(review): `score_threshold=0.80` is expected to drop hits scoring below
# 0.80 -- confirm against the installed LangChain version.
# `pprint` comes from the notebook's import cell rather than being imported
# mid-notebook, so all dependencies are declared in one place.
docs_and_scores = vector_store.similarity_search_with_relevance_scores(
    query="What is the Income Statement of Microsoft",
    k=4,
    score_threshold=0.80,
)
pprint(docs_and_scores)

[(Document(page_content='manufacturing, marketing, and selling our other products and services; and income taxes.\nHighlights from fiscal year 2023 compared with fiscal year 2022 included:\n•\nMicrosoft Cloud revenue increased 22% to $111.6 billion.\n•\nOffice Commercial products and cloud services revenue increased 10% driven by Office 365 Commercial growth of 13%.\n•\nOffice Consumer products and cloud services revenue increased 2% and Microsoft 365 Consumer subscribers increased to 67.0 million.\n•\nLinkedIn revenue increased 10%.\n•\nDynamics products and cloud services revenue increased 16% driven by Dynamics 365 growth of 24%.\n•\nServer products and cloud services revenue increased 19% driven by Azure and other cloud services growth of 29%.\n•\nWindows original equipment manufacturer licensing (“Windows OEM”) revenue decreased 25%.\n•\nDevices revenue decreased 24%.\n•\nWindows Commercial products and cloud services revenue increased 5%.\n•\nXbox content and services revenue dec