In [1]:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex,
)
from azure.search.documents.models import VectorizedQuery
from openai import AzureOpenAI
import os
import numpy as np
from dotenv import load_dotenv
import json
load_dotenv()  # Load variables from .env file

True

### Upload to blob

In [4]:
def get_blob_client():
    connection_string = str(os.getenv("BLOB_CONNECTION_STRING"))
    return BlobServiceClient.from_connection_string(connection_string)

In [5]:
def get_blob_container_client(container_name, blob_service_client):
    # get the container client
    return blob_service_client.get_container_client(container_name)

In [6]:
def list_blob_containers():
    blob_service_client = get_blob_client()
    # List all containers in the storage account
    containers = blob_service_client.list_containers()
    for container in containers:
        print(container['name'])

In [7]:
list_blob_containers()

docs


In [8]:
def blob_upload(local_file_path):
    # get the blob client
    blob_service_client = get_blob_client()
    # set container name
    container_name = "docs"
    # get the container client
    container_client = get_blob_container_client(container_name=container_name,blob_service_client=blob_service_client)
    # local_file_path = "docs/test_ima_1.pdf"
    #get the filename
    blob_name = os.path.basename(local_file_path)
    # uplaod
    with open(local_file_path, "rb") as data:
        container_client.upload_blob(name=blob_name, data=data, overwrite=True)

    print(f"Uploaded {blob_name} to {container_name}")

### Download File from blob

In [9]:
# Download the blob to a local file
def blob_download(filename):
    # get the blob client
    blob_service_client = get_blob_client()
    # get the container client
    container_client = get_blob_container_client(container_name=container_name,blob_service_client=blob_service_client)
    with open(f"downloads/{filename}", "wb") as download_file:
        download_blob = container_client.download_blob(filename)
        download_blob.readinto(download_file)

In [200]:
blob_download('test_ima.pdf')

### Use Document Intelligence to get text from PDF

In [2]:
def get_document_intelligence_client():
    endpoint = str(os.getenv("DOC_INT_ENDPOINT"))
    key = str(os.getenv("DOC_INT_KEY"))
    return DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )


In [25]:
def extract_document_structure(local_path):
    document_analysis_client = get_document_intelligence_client()
    with open(local_path, "rb") as f:
        # poller = document_analysis_client.begin_analyze_document(
        #     "prebuilt-document", document=f
        # )
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-layout", document=f
        )
    result = poller.result()

    # document_text = ""
    # for page in result.pages:
    #     for line in page.lines:
    #         document_text += line.content + "\n"

    # return document_text
    return result

In [19]:
print(extract_document_structure('docs/test_ima.pdf'))

THE
INVESTMENT
ASSOCIATION
NORTON ROSE FULBRIGHT
MODEL
DISCRETIONARY
INVESTMENT
MANAGEMENT
AGREEMENT
Published by The Investment Association
in cooperation with Norton Rose Fulbright LLP
November 2021
THE INVESTMENT ASSOCIATION
The Investment Association
Camomile Court, 23 Camomile Street, London, EC3A 7LL
www.theia.org
@InvAssoc
November 2021
The Investment Association (2021). All rights reserved.
No reproduction without permission of The Investment Association.
2
MODEL DISCRETIONARY INVESTMENT MANAGEMENT AGREEMENT
The copyright in this model investment management agreement template (the "Model IMA") is owned by the
Investment Association (the "IA").
The IA has produced this Model IMA in conjunction with members from a broad selection of the IA's member
firms, both full and associate members, and it is intended to represent the views of all types and sizes of
asset and fund management firms. The working group included representatives from dealing, compliance,
legal, operational and pr

### Embedding the text using ada-002

In [9]:
def get_embedding_model_client():
    # Azure OpenAI credentials
    ada_endpoint = str(os.getenv("ADA_ENDPOINT"))
    ada_key = str(os.getenv("ADA_KEY"))
    # Initialize Azure OpenAI client
    return AzureOpenAI(
        azure_endpoint=ada_endpoint,
        api_key=ada_key,
        api_version="2023-05-15"
    )

In [10]:
def create_embeddings(text, max_tokens=8191):
    ada_deployment = "text-embedding-ada-002"
    
    # Split the text into chunks if it exceeds the max token limit
    chunks = [text[i:i+max_tokens] for i in range(0, len(text), max_tokens)]
    openai_client = get_embedding_model_client()
    embeddings = []
    for chunk in chunks:
        response = openai_client.embeddings.create(
            input=chunk,
            model=ada_deployment
        )
        embedding = response.data[0].embedding
        embeddings.append(embedding)
    
    # If there are multiple chunks, average the embeddings
    if len(embeddings) > 1:
        return np.mean(embeddings, axis=0).tolist()
    else:
        return embeddings[0]

In [11]:
# Create embeddings
embeddings = create_embeddings(extract_document_text("docs/test_ima_1.pdf"))

print(f"Embedding dimension: {len(embeddings)}")
print(f"First few values of the embedding: {embeddings[:5]}")

Embedding dimension: 1536
First few values of the embedding: [-0.011320868002561232, -0.02466426727672418, -0.008200732214997212, -0.042265833665927253, -0.030091070259610813]


### AI Search/Cognitive Search : Create an Index

In [33]:
def get_ai_search_index_client():
    admin_key = str(os.getenv("AI_SEARCH_KEY"))
    endpoint = str(os.getenv("AI_SEARCH_ENDPOINT"))
    if not admin_key or not endpoint:
        raise ValueError("AI_SEARCH_KEY or AI_SEARCH_ENDPOINT environment variables are not set")
    
    print(f"Connecting to: {endpoint}")
    try:
        return SearchIndexClient(endpoint=endpoint, credential=AzureKeyCredential(admin_key))
    except Exception as e:
        print(f"Error creating SearchIndexClient: {str(e)}")
        raise

In [92]:
def create_vector_index(index_name):
    index_client = get_ai_search_index_client()
    # predefined and cant change for ada-002
    ada_embedding_dimensions =1536
    # Define the index schema
    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
        SearchableField(name="title", type=SearchFieldDataType.String),
        SearchableField(name="content", type=SearchFieldDataType.String),
        SearchableField(name="category", type=SearchFieldDataType.String,
                        filterable=True),
        SearchableField(name="link", type=SearchFieldDataType.String,
                        filterable=True),
        SearchField(name="titleVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True, vector_search_dimensions=ada_embedding_dimensions,vector_search_profile_name="myHnswProfile"),
        SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True, vector_search_dimensions=ada_embedding_dimensions,vector_search_profile_name="myHnswProfile"),
    ]
    # Configure the vector search configuration  
    vector_search = VectorSearch(
        algorithms=[
            HnswAlgorithmConfiguration(
                name="myHnsw"
            )
        ],
        profiles=[
            VectorSearchProfile(
                name="myHnswProfile",
                algorithm_configuration_name="myHnsw",
                # vectorizer="myVectorizer"
            )
        ],
        # vectorizers=[
        #     AzureOpenAIVectorizer(
        #         name="myVectorizer",
        #         azure_open_ai_parameters=AzureOpenAIParameters(
        #             resource_uri=ada_endpoint,
        #             deployment_id=ada_deployment,
        #             model_name=ada_deployment,
        #             api_key=ada_key
        #         )
        #     )
        # ]
    )
    
    semantic_config = SemanticConfiguration(
        name="my-semantic-config",
        prioritized_fields=SemanticPrioritizedFields(
            title_field=SemanticField(field_name="title"),
            keywords_fields=[SemanticField(field_name="category")],
            content_fields=[SemanticField(field_name="content")]
        )
    )

    # Create the semantic settings with the configuration
    semantic_search = SemanticSearch(configurations=[semantic_config])

    # Create the search index with the semantic settings
    index = SearchIndex(name=index_name, fields=fields,
                        vector_search=vector_search, semantic_search=semantic_search)
    result = index_client.create_or_update_index(index)
    print(f'{result.name} created')

In [93]:
create_vector_index("legal-docs")

Connecting to: https://aisearchpoc0924.search.windows.net
legal-docs created


### AI Search/Cognitive Search : Uploading to an index

In [40]:
def get_ai_search_client():
    # Initialize Search client
    admin_key = str(os.getenv("AI_SEARCH_KEY"))
    endpoint = str(os.getenv("AI_SEARCH_ENDPOINT"))
    index_name = 'legal-docs'
    return SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(admin_key))

In [41]:
def upload_to_vector_index(documents):
    search_client = get_ai_search_client()
    # Upload documents to the index
    result = search_client.upload_documents(documents)
    print(f"Uploaded {len(result)} documSents")

In [94]:
def semantic_chunking_with_overlap(document_result, overlap=100):
    chunks = []
    for page_num, page in enumerate(document_result.pages, start=1):
        current_chunk = ""
        for line in page.lines:
            if len(current_chunk) + len(line.content) > 1000:
                chunks.append((page_num, current_chunk.strip()))
                current_chunk = current_chunk[-overlap:]
            current_chunk += line.content + " "
        if current_chunk:
            chunks.append((page_num, current_chunk.strip()))
    return chunks

In [95]:
def process_document(local_path, doc_id, title, link):
    document_result = extract_document_structure(local_path)
    chunks = semantic_chunking_with_overlap(document_result)
    
    documents = []
    for i, (page_num, chunk) in enumerate(chunks):
        chunk_id = f"{doc_id}-chunk-{i}"
        documents.append({
            "id": chunk_id,
            "title": f"{title} - Page {page_num}",
            "content": chunk,
            "category": "IMA",
            "link": link,
            "titleVector": create_embeddings(f"{title} - Page {page_num}"),
            "contentVector": create_embeddings(chunk)
        })
    return documents

In [96]:
# Process and upload documents
documents1 = process_document("docs/test_ima.pdf", "1", "Sample IMA 1","https://www.google.com/")
documents2 = process_document("docs/test_ima_1.pdf", "2", "Sample IMA 2","https://www.youtube.com/")
all_documents = documents1 + documents2
# upload_to_vector_index(all_documents)
# # Prepare and index your documents
# document_text1 = extract_document_text(local_path="docs/test_ima.pdf")
# document_text2 = extract_document_text(local_path="docs/test_ima_1.pdf")
# documents = [
#     {
#         "id": "1",
#         "title": "Sample IMA 1",
#         "content": document_text1,
#         "category": "IMA",
#         "titleVector": create_embeddings("Sample IMA 1"),
#         "contentVector": create_embeddings(document_text1)
#     },
#     {
#         "id": "2",
#         "title": "Sample IMA 2",
#         "content": document_text2,
#         "category": "IMA",
#         "titleVector": create_embeddings("Sample IMA 2"),
#         "contentVector": create_embeddings(document_text2)
#     }
#     # Add more documents as needed
# ]


In [97]:
upload_to_vector_index(all_documents)

Uploaded 218 documSents


### AI Search/Cognitive Search : Searching an index

In [98]:
def search_vector_index(query,doc_id=2):
    embedding  = create_embeddings(query)
    vector_query = VectorizedQuery(vector=embedding, k_nearest_neighbors=3, fields="titleVector")
    search_client = get_ai_search_client()
    # Create a filter to restrict search to a specific document ID
    # Corrected filter expression
    # filter_expression = f"id eq '{doc_id}' or id startswith '{doc_id}-chunk-'"
    #perform vector search
    return search_client.search(  
        search_text=query,  
        vector_queries= [vector_query],
        select=["title", "content", "category","link"],
        # filter=filter_expression
    )  

In [99]:
results = search_vector_index("force majeure")
for result in results:  
    print(result.keys())
    print(f"Title: {result['title']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Content: {result['content']}")  
    print(f"Category: {result['category']}")  
    print(f"link: {result['link']}\n")  

dict_keys(['category', 'link', 'content', 'title', '@search.score', '@search.reranker_score', '@search.highlights', '@search.captions'])
Title: Sample IMA 1 - Page 31
Score: 0.01666666753590107
Content: e Client as considered appropriate, including through the Manager's website, by email or otherwise. [The Client hereby consents to receiving communications and reports under this Agreement (including but not limited to valuations) electronically online via the Manager's secure client website ([insert website link]).] The Client understands that if documents are only available online the Client will not receive a printed version. A paper copy of such communications and reports will be available to the Client upon request. 32. Force Majeure No Party to this Agreement shall be liable for any failure or delay in performing any of its obligations under or pursuant to this Agreement, and any such failure or delay in performing its obligations will not constitute a breach of this Agreement, if

### Using the vector search results as context to the prompt and returning output using GPT 4o

In [51]:
def get_relevant_context(query):
    # Perform vector search
    results = search_vector_index(query)
    # Extract and return relevant context as JSON objects
    context = [
        {
            "title": result['title'],
            "category": result['category'],
            "content": result['content'],
            "link": result['link']
        } for result in results
    ]
    return context

In [52]:
def get_gpt4o_client():
    # Azure OpenAI credentials
    gpt4o_endpoint = os.getenv("GPT4O_ENDPOINT")
    gpt4o_key = os.getenv("GPT4O_KEY")
    deployment_name = "gpt-4o"
    
    # Initialize Azure OpenAI client
    client = AzureOpenAI(
        api_key=gpt4o_key,
        api_version="2023-05-15",
        azure_endpoint=gpt4o_endpoint
    )
    
    return client, deployment_name

In [59]:
def generate_response(query, context):
    context_str = json.dumps(context, indent=2)
    prompt = f"""Use the following information to answer the user's question. 
    If the information doesn't contain the answer, say you don't know.

    Context (JSON format):
    {context_str}

    User's question: {query}

    Provide a concise answer based on the given context. If relevant, mention which document(s) 
    (by title or category) the information comes from and cite the link as well.
    Cite the actual words from the document as well.
    """
    openai_client,deployment_name = get_gpt4o_client()
    response = openai_client.chat.completions.create(
        model=deployment_name,  # Your deployed GPT-4o model name
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    
    return response.choices[0].message.content

In [64]:
query = "which documents contain the force majeure clause"
context = get_relevant_context(query)


In [65]:
response = generate_response(query,context)
print(response)

The force majeure clause is mentioned in the following documents:

1. **Sample IMA 1 - Part 104**:
    - "No Party to this Agreement shall be liable for any failure or delay in performing any of its obligations under or pursuant to this Agreement, and any such failure or delay in performing its obligations will not constitute a breach of this Agreement, if and to the extent that such failure or delay is due to an event of Force Majeure."
    - [Link](https://www.google.com/)
   
2. **Sample IMA 1 - Part 16**:
    - "Force Majeure\" means any event preventing either of the Parties from performing any or all of its obligations under this Agreement which arises from or is attributable to acts, events, omissions or accidents beyond the reasonable control of the Party so prevented, including, without limitation, nationalisation, expropriation or"
    - [Link](https://www.google.com/)
   
3. **Sample IMA 1 - Part 24**:
    - "Termination Event\" means the occurrence at any time with respect 