In [36]:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex,
)
from azure.search.documents.models import VectorizedQuery
from openai import AzureOpenAI
import os
import numpy as np
from dotenv import load_dotenv
import json
load_dotenv()  # Load variables from .env file

True

### Upload to blob

In [4]:
def get_blob_client():
    """Build a BlobServiceClient from the BLOB_CONNECTION_STRING environment variable.

    Returns:
        The client produced by BlobServiceClient.from_connection_string.

    Raises:
        ValueError: if BLOB_CONNECTION_STRING is unset or empty.
    """
    # Read the raw value: wrapping os.getenv in str() would turn a missing
    # variable into the literal string "None" and hide the configuration error.
    connection_string = os.getenv("BLOB_CONNECTION_STRING")
    if not connection_string:
        raise ValueError("BLOB_CONNECTION_STRING environment variable is not set")
    return BlobServiceClient.from_connection_string(connection_string)

In [5]:
def get_blob_container_client(container_name, blob_service_client):
    """Return the container client for `container_name`.

    Args:
        container_name: name of the container to address.
        blob_service_client: object exposing `get_container_client(name)`.

    Returns:
        Whatever `blob_service_client.get_container_client` returns.
    """
    container_client = blob_service_client.get_container_client(container_name)
    return container_client

In [6]:
def list_blob_containers():
    """Print the name of every container in the storage account, one per line."""
    service = get_blob_client()
    for entry in service.list_containers():
        print(entry['name'])

In [7]:
# Print the names of the containers in the storage account.
list_blob_containers()

docs


In [8]:
def blob_upload(local_file_path, container_name="docs"):
    """Upload a local file to a blob container, replacing any blob of the same name.

    Args:
        local_file_path: path of the file to upload; the blob is named after
            the file's base name (directories are dropped).
        container_name: target container. Defaults to "docs".

    Prints a confirmation line once the upload call returns.
    """
    blob_service_client = get_blob_client()
    container_client = get_blob_container_client(
        container_name=container_name,
        blob_service_client=blob_service_client,
    )
    # Only the file name is used as the blob name, so two files with the same
    # name from different folders overwrite each other (overwrite=True).
    blob_name = os.path.basename(local_file_path)
    with open(local_file_path, "rb") as data:
        container_client.upload_blob(name=blob_name, data=data, overwrite=True)

    print(f"Uploaded {blob_name} to {container_name}")

### Download file from blob

In [9]:
# Download the blob to a local file
def blob_download(filename, container_name="docs", download_dir="downloads"):
    """Download a blob into a local directory under the same name.

    Args:
        filename: name of the blob; also used as the local file name.
        container_name: container holding the blob. Defaults to "docs".
        download_dir: local directory to write into. Defaults to "downloads".
    """
    blob_service_client = get_blob_client()
    container_client = get_blob_container_client(
        container_name=container_name,
        blob_service_client=blob_service_client,
    )
    # Request the blob before touching the filesystem so a failed lookup does
    # not leave an empty local file behind.
    downloader = container_client.download_blob(filename)

    local_path = os.path.join(download_dir, filename)
    # Create the target directory (including any sub-path in the blob name).
    os.makedirs(os.path.dirname(local_path) or ".", exist_ok=True)
    with open(local_path, "wb") as download_file:
        downloader.readinto(download_file)

In [200]:
# Download the blob 'test_ima.pdf' into the local downloads/ folder.
blob_download('test_ima.pdf')

### Use Document Intelligence to get text from PDF

In [10]:
def get_document_intelligence_client():
    """Build a DocumentAnalysisClient from DOC_INT_ENDPOINT and DOC_INT_KEY.

    Returns:
        DocumentAnalysisClient authenticated with an AzureKeyCredential.

    Raises:
        ValueError: if either environment variable is unset or empty.
    """
    # Read the raw values: str(os.getenv(...)) would turn a missing variable
    # into the literal string "None" instead of failing here.
    endpoint = os.getenv("DOC_INT_ENDPOINT")
    key = os.getenv("DOC_INT_KEY")
    if not endpoint or not key:
        raise ValueError("DOC_INT_ENDPOINT or DOC_INT_KEY environment variables are not set")
    return DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )


In [11]:
def extract_document_text(local_path):
    """Run the "prebuilt-document" model on a local file and return its text.

    Args:
        local_path: path of the file to analyze; opened in binary mode.

    Returns:
        str: the content of every line on every page, in page order, each
        followed by a newline.
    """
    client = get_document_intelligence_client()
    with open(local_path, "rb") as source_file:
        analysis = client.begin_analyze_document(
            "prebuilt-document", document=source_file
        )
    # The poller is awaited after the file has been closed, as before.
    analyzed = analysis.result()

    return "".join(
        line.content + "\n"
        for page in analyzed.pages
        for line in page.lines
    )

In [12]:
# extract_document_text('docs/test_ima.pdf')



### Embedding the text using ada-002

In [13]:
def get_embedding_model_client():
    """Build the AzureOpenAI client used for embeddings from ADA_ENDPOINT and ADA_KEY.

    Returns:
        AzureOpenAI client pinned to API version "2023-05-15".

    Raises:
        ValueError: if either environment variable is unset or empty.
    """
    # Read the raw values: str(os.getenv(...)) would turn a missing variable
    # into the literal string "None" instead of failing here.
    ada_endpoint = os.getenv("ADA_ENDPOINT")
    ada_key = os.getenv("ADA_KEY")
    if not ada_endpoint or not ada_key:
        raise ValueError("ADA_ENDPOINT or ADA_KEY environment variables are not set")
    return AzureOpenAI(
        azure_endpoint=ada_endpoint,
        api_key=ada_key,
        api_version="2023-05-15"
    )

In [14]:
def create_embeddings(text, max_tokens=8191):
    """Embed `text` with the "text-embedding-ada-002" deployment.

    The text is cut into consecutive slices of at most `max_tokens`
    characters and each slice is embedded with its own request. With more
    than one slice, the element-wise mean of the slice embeddings is returned.

    Args:
        text: non-empty string to embed.
        max_tokens: maximum slice length. Despite the name this is applied as
            a character count, not a tokenizer count.
            NOTE(review): assumes a slice of N characters never exceeds N
            tokens for the documents used here - confirm.

    Returns:
        list of floats: a single embedding vector.

    Raises:
        ValueError: if `text` is empty or None.
    """
    # An empty text yields no slices, which previously surfaced as an
    # IndexError on embeddings[0]; fail early with a clear message instead.
    if not text:
        raise ValueError("text must be a non-empty string")

    ada_deployment = "text-embedding-ada-002"

    chunks = [text[i:i+max_tokens] for i in range(0, len(text), max_tokens)]
    openai_client = get_embedding_model_client()
    embeddings = [
        openai_client.embeddings.create(
            input=chunk,
            model=ada_deployment
        ).data[0].embedding
        for chunk in chunks
    ]

    # Several slices: collapse them into one vector by averaging.
    if len(embeddings) > 1:
        return np.mean(embeddings, axis=0).tolist()
    return embeddings[0]

In [15]:
# Embed the text extracted from docs/test_ima_1.pdf.
# create_embeddings returns a single vector (a list of floats), so len()
# below reports the vector's dimension, not a number of embeddings.
embeddings = create_embeddings(extract_document_text("docs/test_ima_1.pdf"))

print(f"Embedding dimension: {len(embeddings)}")
print(f"First few values of the embedding: {embeddings[:5]}")

Embedding dimension: 1536
First few values of the embedding: [-0.011320868002561232, -0.02466426727672418, -0.008200732214997212, -0.042265833665927253, -0.030091070259610813]


### AI Search/Cognitive Search: Create an index

In [16]:
def get_ai_search_index_client():
    """Build a SearchIndexClient from AI_SEARCH_KEY and AI_SEARCH_ENDPOINT.

    Prints the endpoint being connected to.

    Returns:
        SearchIndexClient authenticated with an AzureKeyCredential.

    Raises:
        ValueError: if either environment variable is unset or empty.
        Exception: any error raised while constructing the client is printed
            and re-raised unchanged.
    """
    # Read the raw values. With str(os.getenv(...)) a missing variable became
    # the truthy string "None", so the check below could never fire.
    admin_key = os.getenv("AI_SEARCH_KEY")
    endpoint = os.getenv("AI_SEARCH_ENDPOINT")
    if not admin_key or not endpoint:
        raise ValueError("AI_SEARCH_KEY or AI_SEARCH_ENDPOINT environment variables are not set")

    print(f"Connecting to: {endpoint}")
    try:
        return SearchIndexClient(endpoint=endpoint, credential=AzureKeyCredential(admin_key))
    except Exception as e:
        print(f"Error creating SearchIndexClient: {str(e)}")
        raise

In [17]:
def create_vector_index(index_name):
    """Create or update a search index with text, vector and semantic settings.

    The schema has a string key ("id"), three searchable text fields
    ("title", "content", "category") and two vector fields ("titleVector",
    "contentVector") bound to a single HNSW profile. A semantic configuration
    named "my-semantic-config" prioritizes title, category and content.

    Args:
        index_name: name of the index to create or update.

    Prints "<name> created" using the name on the object returned by
    `create_or_update_index`.
    """
    client = get_ai_search_index_client()

    # Vector length is fixed for ada-002 embeddings; it is not a tunable.
    dims = 1536
    hnsw_algorithm = "myHnsw"
    hnsw_profile = "myHnswProfile"

    def _vector_field(field_name):
        # Both vector fields share the same type, size and search profile.
        return SearchField(
            name=field_name,
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=dims,
            vector_search_profile_name=hnsw_profile,
        )

    schema = [
        SimpleField(
            name="id",
            type=SearchFieldDataType.String,
            key=True,
            sortable=True,
            filterable=True,
            facetable=True,
        ),
        SearchableField(name="title", type=SearchFieldDataType.String),
        SearchableField(name="content", type=SearchFieldDataType.String),
        SearchableField(
            name="category",
            type=SearchFieldDataType.String,
            filterable=True,
        ),
        _vector_field("titleVector"),
        _vector_field("contentVector"),
    ]

    # One algorithm, one profile; no vectorizer is attached to the profile.
    vector_settings = VectorSearch(
        algorithms=[HnswAlgorithmConfiguration(name=hnsw_algorithm)],
        profiles=[
            VectorSearchProfile(
                name=hnsw_profile,
                algorithm_configuration_name=hnsw_algorithm,
            )
        ],
    )

    ranking_fields = SemanticPrioritizedFields(
        title_field=SemanticField(field_name="title"),
        keywords_fields=[SemanticField(field_name="category")],
        content_fields=[SemanticField(field_name="content")],
    )
    semantic_settings = SemanticSearch(
        configurations=[
            SemanticConfiguration(
                name="my-semantic-config",
                prioritized_fields=ranking_fields,
            )
        ]
    )

    definition = SearchIndex(
        name=index_name,
        fields=schema,
        vector_search=vector_settings,
        semantic_search=semantic_settings,
    )
    created = client.create_or_update_index(definition)
    print(f'{created.name} created')

In [24]:
# Create (or update) the "legal-docs" index.
create_vector_index("legal-docs")

Connecting to: https://aisearchpoc0924.search.windows.net
legal-docs created


### AI Search/Cognitive Search: Uploading to an index

In [18]:
def get_ai_search_client(index_name='legal-docs'):
    """Build a SearchClient for one index from AI_SEARCH_KEY and AI_SEARCH_ENDPOINT.

    Args:
        index_name: index the client is bound to. Defaults to 'legal-docs'.

    Returns:
        SearchClient authenticated with an AzureKeyCredential.

    Raises:
        ValueError: if either environment variable is unset or empty.
    """
    # Read the raw values: str(os.getenv(...)) would turn a missing variable
    # into the literal string "None" instead of failing here.
    admin_key = os.getenv("AI_SEARCH_KEY")
    endpoint = os.getenv("AI_SEARCH_ENDPOINT")
    if not admin_key or not endpoint:
        raise ValueError("AI_SEARCH_KEY or AI_SEARCH_ENDPOINT environment variables are not set")
    return SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(admin_key))

In [19]:
def upload_to_vector_index(documents):
    """Upload documents to the search index and report the outcome.

    Args:
        documents: list of dicts matching the index schema.

    Prints how many documents were accepted, plus one line per rejected
    document with its key and error message.
    """
    search_client = get_ai_search_client()
    result = search_client.upload_documents(documents)

    # upload_documents returns one result per document; a returned entry does
    # not by itself mean that document was indexed, so check each one.
    failed = [item for item in result if not item.succeeded]
    print(f"Uploaded {len(result) - len(failed)} documents")
    for item in failed:
        print(f"Failed to upload document {item.key}: {item.error_message}")

In [27]:
# Prepare the documents to index: extract the text of each PDF, then build one
# record per PDF. Record keys match the field names defined in
# create_vector_index (id, title, content, category, titleVector, contentVector).
document_text1 = extract_document_text(local_path="docs/test_ima.pdf")
document_text2 = extract_document_text(local_path="docs/test_ima_1.pdf")
documents = [
    {
        "id": "1",
        "title": "Sample IMA 1",
        "content": document_text1,
        "category": "IMA",
        # The title and the full text are embedded separately.
        "titleVector": create_embeddings("Sample IMA 1"),
        "contentVector": create_embeddings(document_text1)
    },
    {
        "id": "2",
        "title": "Sample IMA 2",
        "content": document_text2,
        "category": "IMA",
        "titleVector": create_embeddings("Sample IMA 2"),
        "contentVector": create_embeddings(document_text2)
    }
    # Add more documents as needed
]


In [28]:
# Push the prepared documents into the search index.
upload_to_vector_index(documents)

Uploaded 2 documSents


### AI Search/Cognitive Search: Searching an index

In [24]:
def search_vector_index(query, k_nearest_neighbors=3, vector_fields="titleVector"):
    """Run a hybrid (keyword + vector) search for `query`.

    The query text is embedded with `create_embeddings` and sent both as the
    keyword search text and as a vector query.

    Args:
        query: user query string.
        k_nearest_neighbors: number of nearest neighbours requested from the
            vector query. Defaults to 3.
        vector_fields: vector field(s) the query vector is compared against.
            Defaults to "titleVector", so by default only title embeddings
            take part in the vector side of the search.

    Returns:
        The iterable returned by `SearchClient.search`; each result carries
        the "title", "content" and "category" fields.
    """
    embedding = create_embeddings(query)
    vector_query = VectorizedQuery(
        vector=embedding,
        k_nearest_neighbors=k_nearest_neighbors,
        fields=vector_fields,
    )
    search_client = get_ai_search_client()
    # search_text and vector_queries are both set, so this is a hybrid search
    # rather than a pure vector search.
    return search_client.search(
        search_text=query,
        vector_queries=[vector_query],
        select=["title", "content", "category"],
    )

In [25]:
# Run a sample search and print, for each hit, the available keys, the title,
# the search score and the category.
results = search_vector_index("authorized person")
for result in results:  
    print(result.keys())
    print(f"Title: {result['title']}")  
    print(f"Score: {result['@search.score']}")  
    #print(f"Content: {result['content']}")  
    print(f"Category: {result['category']}\n")  

dict_keys(['content', 'title', 'category', '@search.score', '@search.reranker_score', '@search.highlights', '@search.captions'])
Title: Sample IMA 1
Score: 0.03306011110544205
Category: IMA

dict_keys(['content', 'title', 'category', '@search.score', '@search.reranker_score', '@search.highlights', '@search.captions'])
Title: Sample IMA 2
Score: 0.03306011110544205
Category: IMA



In [27]:
def get_relevant_context(query):
    """Search the index for `query` and return the hits as plain dicts.

    Args:
        query: user query string passed to `search_vector_index`.

    Returns:
        list of dicts, one per hit, each with the keys "title", "category"
        and "content" (in that order).
    """
    wanted_keys = ("title", "category", "content")
    return [
        {key: hit[key] for key in wanted_keys}
        for hit in search_vector_index(query)
    ]

In [30]:
def get_gpt4o_client():
    """Build the AzureOpenAI chat client from GPT4O_ENDPOINT and GPT4O_KEY.

    Returns:
        tuple: (AzureOpenAI client pinned to API version "2023-05-15",
        deployment name "gpt-4o").

    Raises:
        ValueError: if either environment variable is unset or empty.
    """
    gpt4o_endpoint = os.getenv("GPT4O_ENDPOINT")
    gpt4o_key = os.getenv("GPT4O_KEY")
    # Fail here with a clear message rather than handing None to the client.
    if not gpt4o_endpoint or not gpt4o_key:
        raise ValueError("GPT4O_ENDPOINT or GPT4O_KEY environment variables are not set")
    deployment_name = "gpt-4o"

    client = AzureOpenAI(
        api_key=gpt4o_key,
        api_version="2023-05-15",
        azure_endpoint=gpt4o_endpoint
    )

    return client, deployment_name

In [38]:
def generate_response(query, context):
    """Ask the chat model to answer `query` using only the supplied context.

    Args:
        query: the user's question.
        context: JSON-serializable object (serialized with indent=2 and
            embedded in the prompt).

    Returns:
        The message content of the first choice in the chat completion.
    """
    serialized_context = json.dumps(context, indent=2)
    user_prompt = f"""Use the following information to answer the user's question. 
    If the information doesn't contain the answer, say you don't know.

    Context (JSON format):
    {serialized_context}

    User's question: {query}

    Provide a concise answer based on the given context. If relevant, mention which document(s) 
    (by title or category) the information comes from.
    """
    chat_client, deployment = get_gpt4o_client()
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_prompt},
    ]
    # `model` is the name of the deployed chat model.
    completion = chat_client.chat.completions.create(
        model=deployment,
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [39]:
# Retrieve the indexed documents matching the question; they become the
# context handed to the chat model in the next cell.
query = "authorized person"
context = get_relevant_context(query)


In [40]:
# Ask the chat model to answer the query from the retrieved context.
response = generate_response(query,context)
print(response)

An **"Authorised Person"** refers to an individual who is designated with authority to give instructions on behalf of the client. This individual's name, details, and signature are specified in the documentation related to the investment management agreement.

**In the context of "Sample IMA 1"**:
The details of the Authorised Person appear in **Schedule 1**, and changes to the list of Authorised Persons can be made by the client by giving notice to the Manager.

**In the context of "Sample IMA 2"**:
Although "Sample IMA 2" discusses client instructions and communication, it does not explicitly define an "Authorised Person" as outlined in "Sample IMA 1."

In summary, for specific details related to an "Authorised Person," refer to **Sample IMA 1**.
