# End to end process

This code demonstrates the complete process, which consists of the following tasks:
1. Convert HTML files to markdown format
2. Chunk markdown content with the maximum number of tokens specified
3. Create the index and upload the chunks
4. Test the search and answer generation, creating Excel files with the results of the answer evaluation

## Prerequisites
+ An Azure subscription, with [access to Azure OpenAI](https://aka.ms/oai/access).
+ A Document Intelligence service with its endpoint and API key.
+ An Azure OpenAI service with the service name and an API key.
+ A deployment of the text-embedding-ada-002 embedding model on the Azure OpenAI Service.
+ An Azure AI Search service with its endpoint, API key and the name of the index to create.

We used Python 3.12.3, [Visual Studio Code with the Python extension](https://code.visualstudio.com/docs/python/python-tutorial), and the [Jupyter extension](https://marketplace.visualstudio.com/items?itemName=ms-toolsai.jupyter) to test this example.

### Set up a Python virtual environment in Visual Studio Code

1. Open the Command Palette (Ctrl+Shift+P).
1. Search for **Python: Create Environment**.
1. Select **Venv**.
1. Select a Python interpreter. Choose 3.10 or later.

It can take a minute to set up. If you run into problems, see [Python environments in VS Code](https://code.visualstudio.com/docs/python/environments).

In [None]:
!pip install openai
!pip install azure-search-documents
!pip install nbimporter
!pip install nbformat

## Import packages and create AOAI and AI Search clients

In [1]:
import os
from dotenv import load_dotenv
from openai import AzureOpenAI
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
import sys
sys.path.append('..')
from pa_utils import load_files

# Read settings from the .env file (override=True is passed so .env values are applied
# on top of the current environment)
load_dotenv(override=True)

# --- Azure AI Search: connection settings and client ---
ai_search_endpoint = os.environ["SEARCH_SERVICE_ENDPOINT"]
ai_search_apikey = os.environ["SEARCH_SERVICE_QUERY_KEY"]
ai_search_index_name = os.environ["SEARCH_INDEX_NAME"]
ai_search_credential = AzureKeyCredential(ai_search_apikey)
ai_search_client = SearchClient(
    endpoint=ai_search_endpoint,
    index_name=ai_search_index_name,
    credential=ai_search_credential,
)

# API version shared by every Azure OpenAI client created below
aoai_api_version = '2024-02-15-preview'


def _new_aoai_client(deployment, endpoint, key):
    """Return an AzureOpenAI client for one deployment, using the shared API version."""
    return AzureOpenAI(
        azure_deployment=deployment,
        api_version=aoai_api_version,
        azure_endpoint=endpoint,
        api_key=key,
    )


# --- Azure OpenAI: answer generation ---
aoai_answer_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
aoai_answer_apikey = os.environ["AZURE_OPENAI_API_KEY"]
aoai_answer_model_name = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"]
aoai_answer_client = _new_aoai_client(
    aoai_answer_model_name, aoai_answer_endpoint, aoai_answer_apikey
)

# --- Azure OpenAI: reranking ---
aoai_rerank_endpoint = os.environ["AZURE_OPENAI_RERANK_ENDPOINT"]
azure_openai_rerank_key = os.environ["AZURE_OPENAI_RERANK_API_KEY"]
rerank_model_name = os.environ["AZURE_OPENAI_RERANK_DEPLOYMENT_NAME"]
aoai_rerank_client = _new_aoai_client(
    rerank_model_name, aoai_rerank_endpoint, azure_openai_rerank_key
)

# --- Azure OpenAI: embeddings (both deployments share one endpoint and key) ---
aoai_embedding_endpoint = os.environ["AZURE_OPENAI_EMBEDDING_ENDPOINT"]
azure_openai_embedding_key = os.environ["AZURE_OPENAI_EMBEDDING_API_KEY"]
embedding_model_name_ada = os.environ["AZURE_OPENAI_EMBEDDING_NAME_ADA"]
embedding_model_name_large_3 = os.environ["AZURE_OPENAI_EMBEDDING_NAME_LARGE_3"]
# Embedding client for the ADA deployment
aoai_embedding_client_ada = _new_aoai_client(
    embedding_model_name_ada, aoai_embedding_endpoint, azure_openai_embedding_key
)
# Embedding client for the LARGE-3 deployment
aoai_embedding_client_large_3 = _new_aoai_client(
    embedding_model_name_large_3, aoai_embedding_endpoint, azure_openai_embedding_key
)

# Prepare the tests
def _build_tests():
    """Build the test matrix: one entry per (embedding model, case, max_generate) combination.

    Keys are the test names. Each value is the tuple
    (embedding_fields, case, embedding_model, index_name, max_retrieve, max_generate).
    """
    embedding_fields = "embeddingTitle, embeddingContent"
    max_retrieve = 20
    matrix = {}
    # Each pair is (tag used inside test and index names, model label stored in the tuple)
    for tag, model in (("ada", "ada"), ("large_3", "large-3")):
        index = f"project_assurance_{tag}_512"
        for case in ("upper", "lower"):
            for max_generate in (10, 20):
                name = f"title_content_{tag}_512_search_{case}_{max_retrieve}_{max_generate}"
                matrix[name] = (embedding_fields, case, model, index, max_retrieve, max_generate)
    return matrix


TESTS = _build_tests()

## For every HTML file in the input directory, convert it to markdown format and chunk it

In [None]:
import nbimporter
from convert_html_to_markdown import get_markdown_with_doc_intel
from chunking_with_max_tokens import chunk_with_max_tokens
from create_index_and_index_documents import create_index, index_documents
from generate_synthetic_qa_pairs import generate_answers_and_questions
from search_and_answer_generation_tests import execute_test
import pandas as pd

# End-to-end run: convert each HTML file to markdown, generate Q&A pairs, chunk the
# markdown, index the chunks, then execute every configured search/answer test.
input_dir = 'html_files'
# Each loaded file is read below through its 'title' and 'content' keys
html_files = load_files(input_dir, '.html')

all_chunks = []
qa_data = {'question': [], 'answer': []}
# Read the html files
for i, html_file in enumerate(html_files):
    print(f"[{i + 1}]: {html_file['title']}")
    print(f"\t[{html_file['content']}]")

    # Convert the current HTML file to markdown format
    # (must read from html_file: no chunk exists yet at this point of the iteration)
    print('\tConverting to markdown...')
    markdown = get_markdown_with_doc_intel(html_file['content'])
    print(f'markdown: [{markdown}]')
    title = html_file['title'].replace('.html', '')

    # Generate question/answer pairs from the markdown content and collect them so they
    # can be saved to an Excel file once all files are processed
    qa_pairs = generate_answers_and_questions(title + '. ' + markdown)
    for qa in qa_pairs:
        qa_data['question'].append(qa['question'])
        qa_data['answer'].append(qa['answer'])

    # Chunk the markdown content with a maximum number of tokens and a percentage of overlapping
    chunks = chunk_with_max_tokens(markdown, 512, 0.25)

    # Prepare the list of chunks to be indexed: one record per chunk, not per document.
    # NOTE(review): assumes each chunk is the chunk text and that index_documents expects
    # records with 'title' and 'content' keys — confirm against create_index_and_index_documents
    all_chunks.extend({'title': title, 'content': chunk} for chunk in chunks)

# Save questions and answers pairs in an Excel file
df = pd.DataFrame(qa_data)
qa_output_file = 'qa_pairs_2.xlsx'
df.to_excel(qa_output_file, index=False)
print(f'File {qa_output_file} saved')

# Create the index
# NOTE(review): only the ada index is created and populated here, while TESTS also
# references 'project_assurance_large_3_512' — confirm that index is built elsewhere
index_name = 'project_assurance_ada_512'
create_index(index_name, embedding_model_name_ada)

# Index the chunks
index_documents(index_name, all_chunks)

# Execute the tests; every TESTS value is a 6-element tuple, so exactly six names are unpacked
for test_name, (embedding_fields, case, embbeding_model, index_name, max_retrieve, max_generate) in TESTS.items():
    execute_test(test_name, embedding_fields, case, embbeding_model, index_name, max_retrieve, max_generate)