In [1]:
from IPython.core.magic import register_cell_magic

@register_cell_magic
def skip(line, cell):
    return

In [None]:
from azure.cosmos import CosmosClient, exceptions, PartitionKey
import json
# Define your Cosmos DB account information
endpoint = "https://anildwa-ncus-hybridsearch.documents.azure.com:443/"
key = ""

# Initialize the Cosmos client
client = CosmosClient(endpoint, key)

# Define database and container names
database_name = 'vectordb'
container_name = 'vectortest_hybridsearch'

client.create_database_if_not_exists(id=database_name)
# Connect to the database and container
database = client.get_database_client(database_name)



In [3]:
vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path":"/sectionVector",
            "dataType":"float32",
            "distanceFunction":"cosine",
            "dimensions":1536
        },
        {
            "path":"/keywordVector",
            "dataType":"int8",
            "distanceFunction":"cosine",
            "dimensions":1536
        },
        {
            "path":"/paraVector",
            "dataType":"float32",
            "distanceFunction":"cosine",
            "dimensions":1536
        },
        {
            "path":"/topicVector",
            "dataType":"int8",
            "distanceFunction":"cosine",
            "dimensions":1536
        },
        {
            "path":"/summaryVector",
            "dataType":"int8",
            "distanceFunction":"cosine",
            "dimensions":1536
        }

    ]
}


vector_indexing_policy = {
    
    "indexingMode": "consistent",
    "automatic": True,
    "includedPaths": [
        {
            "path": "/*"
        }
    ],
    "excludedPaths": [
        {
            "path": "/_etag/?"
        },
        {
            "path": "/sectionVector/*"
        },
        {
            "path": "/keywordVector/*"
        },
        {
            "path": "/paraVector/*"
        },
        {
            "path": "/topicVector/*"
        },
        {
            "path": "/summaryVector/*"
        }
    ],
    "vectorIndexes": [
        {
            "path": "/sectionVector",
            "type": "quantizedFlat"
        },
        {
            "path": "/keywordVector",
            "type": "quantizedFlat"
        },
        {
            "path": "/paraVector",
            "type": "quantizedFlat"
        },
        {
            "path": "/topicVector",
            "type": "quantizedFlat"
        },
        {
            "path": "/summaryVector",
            "type": "quantizedFlat"
        }
    ]
}

full_text_paths_policy = {
   "defaultLanguage": "en-US",
   "fullTextPaths": [
       {
           "path": "/section_title",
           "language": "en-US"
       },
       {
           "path": "/para",
           "language": "en-US"
       },
       {
           "path": "/chapter_title",
           "language": "en-US"
       }
   ]
}




vector_indexing_policy_diskANN = {
    
    "indexingMode": "consistent",
    "automatic": True,
    "includedPaths": [
        {
            "path": "/*"
        }
    ],
    "excludedPaths": [
        {
            "path": "/_etag/?"
        },
        {
            "path": "/sectionVector/*"
        },
        {
            "path": "/keywordVector/*"
        },
        {
            "path": "/paraVector/*"
        },
        {
            "path": "/topicVector/*"
        },
        {
            "path": "/summaryVector/*"
        }
    ],
    "fullTextIndexes": [
        {
            "path": "/section_title"
        },
        {
            "path": "/para"
        },
         {
            "path": "/chapter_title"
        }
    ],
    "vectorIndexes": [
        {
            "path": "/sectionVector",
            "type": "quantizedFlat"
        },
        {
            "path": "/keywordVector",
            "type": "quantizedFlat"
        },
        {
            "path": "/paraVector",
            "type": "diskANN"
        },
        {
            "path": "/topicVector",
            "type": "quantizedFlat"
        },
        {
            "path": "/summaryVector",
            "type": "quantizedFlat"
        }
    ]
}


In [4]:
for db in client.list_databases():
    print(db)

{'id': 'vectordb', '_rid': 'e7x0AA==', '_self': 'dbs/e7x0AA==/', '_etag': '"0000f138-0000-0400-0000-6740ce470000"', '_colls': 'colls/', '_users': 'users/', '_ts': 1732300359}


In [5]:
container_name = 'vectortest_hybridsearch_2partitions'

In [6]:
#database.delete_container(container=container_name)

In [7]:
print(container_name)

vectortest_hybridsearch_2partitions


In [8]:
from dataclasses import dataclass, field, asdict, is_dataclass, fields
from typing import List, Optional

@dataclass
class Element:
    """Base class for all elements."""
    id: Optional[str] = None

@dataclass
class Xref(Element):
    id: Optional[str]
    xreflabel: str = ''
    linkend: str = ''

class CustomDecoder(json.JSONDecoder):
    def __init__(self, dataclass_type=None, *args, **kwargs):
        super().__init__(object_hook=self.object_hook, *args, **kwargs)
        self.dataclass_type = dataclass_type

    def object_hook(self, obj):
        if self.dataclass_type and is_dataclass(self.dataclass_type):
            try:
                # Get dataclass field names
                field_names = {f.name for f in fields(self.dataclass_type)}
                # Filter the dictionary to match only fields in the dataclass
                filtered_obj = {k: v for k, v in obj.items() if k in field_names}
                return self.dataclass_type(**filtered_obj)
            except TypeError:
                pass  # If the object doesn't match the dataclass, return as is
        return obj

In [9]:
%%skip
vectors_file_name = 'alta_enriched_with_vectors_v2.json'
with open(vectors_file_name) as f:
    alta_enriched_with_vectors_v2 = json.load(f)


# load json file into pandas dataframe
import pandas as pd
df = pd.DataFrame(alta_enriched_with_vectors_v2)



In [10]:
container_name = 'vectortest_hybridsearch_2partitions'

In [11]:
%%skip
container = database.create_container(id=container_name, partition_key=PartitionKey(path="/book_title"), 
                          vector_embedding_policy=vector_embedding_policy,
                          indexing_policy=vector_indexing_policy_diskANN,
                          full_text_policy=full_text_paths_policy,
                          offer_throughput=10000) 

In [12]:
%%skip
for doc in alta_enriched_with_vectors_v2:
    try:
        
        xrefs_json = doc['xrefs']
        xrefs_list = json.loads(xrefs_json) 
        doc['xrefs'] = xrefs_list
        doc_serialized = json.loads(json.dumps(doc))
        res = container.upsert_item(doc_serialized)

        #res = container.upsert_item(doc)
        #print(res)
        #print(f"Document with id {doc['id']} added successfully.")
    except Exception as e:
        print(f"Failed to insert document {doc['id']}: {e}")

In [13]:
import openai_helper
search_query = "how to configure a policy to archive the archive logs in DB2 and what is the additional configuration needed if netbackup installed on more than one node in the DB2 cluster?"
search_query2 = "how to enable a policy to archive the logs in DB2 and give me more information on user archive schedules?"
search_query_embedded = openai_helper.generate_embeddings([search_query], 1536)
#search_query_embedded[0].embedding

search_query2_embedded = openai_helper.generate_embeddings([search_query2], 1536)
#search_query2_embedded[0].embedding


In [14]:
search_query_arr = search_query.split(" ")
search_query2_arr = search_query2.split(" ")


In [15]:
container_name_multipartition = 'vectortest_hybridsearch'
container_multipartition = database.get_container_client(container_name_multipartition)
container_name_2partition = 'vectortest_hybridsearch_2partitions'
container_2partition = database.get_container_client(container_name_2partition)

## Full Text Search

In [16]:
%%skip
query_string = f"""
SELECT TOP 20 c.section_title
FROM c
ORDER BY RANK FullTextScore(c.para, {search_query_arr})
"""

items = container_multipartition.query_items( 
 query=query_string, 
parameters=[], 
 enable_cross_partition_query=True)

for item in items:
    print(item)

In [17]:
%%skip
query_string = f"""
SELECT TOP 20 c.section_title
FROM c
ORDER BY RANK FullTextScore(c.para, {search_query_arr})
"""

items = container_2partition.query_items( 
 query=query_string, 
parameters=[], 
 enable_cross_partition_query=True)

for item in items:
    print(item)

## Single Vector Search

In [18]:
%%skip
items = container_2partition.query_items( 
 query='SELECT top 20 c.section_title, VectorDistance(c.paraVector, @embedding) AS paraSimilarityScore FROM c', 
parameters=[ 
  {"name": "@embedding", "value": search_query_embedded[0].embedding} 
 ], 
 enable_cross_partition_query=True)


sorted_items = sorted(items, key=lambda x: x['paraSimilarityScore'], reverse=True)

# Display sorted items
for item in sorted_items:
  print(item['section_title'], item['paraSimilarityScore'])
  print("----------------------------------------------------")

## Hybrid Search
Single Vector + Full Text Search

In [19]:
%%skip
query_string = f"""
SELECT TOP 20 c.section_title
FROM c
ORDER BY RANK RRF(VectorDistance(c.paraVector, {search_query_embedded[0].embedding}), FullTextScore(c.para, {search_query_arr}))
"""

items = container_2partition.query_items( 
 query=query_string, 
parameters=[], 
 enable_cross_partition_query=True)

for item in items:
    print(item)

## Hybrid Search
Multi Vector Search + Full Text Search

In [53]:
query_string = f"""
SELECT TOP 5 c.section_id, c.section_title, c.para, c.xrefs
FROM c
ORDER BY RANK RRF(VectorDistance(c.paraVector, {search_query2_embedded[0].embedding}), 
VectorDistance(c.summaryVector, {search_query2_embedded[0].embedding}), 
VectorDistance(c.sectionVector, {search_query2_embedded[0].embedding}),
VectorDistance(c.topicVector, {search_query2_embedded[0].embedding}),
VectorDistance(c.keywordVector, {search_query2_embedded[0].embedding}),
FullTextScore(c.para, {search_query2_arr}))
"""

items = container_2partition.query_items( 
 query=query_string, 
parameters=[], 
 enable_cross_partition_query=True)

#for item in items:
#    print(item)

context = ""

for item in items:
    
    context += f"""Main Section:
    Section Title: {item['section_title']}\n
    Section ID: {item['section_id']}\n
    Paragraph: {item['para']}\n
    Reference Sections: \n
    """

    doc = json.loads(json.dumps(item))
    
    if 'xrefs' in doc:
        xrefs = doc['xrefs']
        xrefs_list = json.loads(xrefs)
        xrefs = [Xref(**xref) for xref in xrefs_list]
        doc['xrefs'] = xrefs
        if len(doc['xrefs']) > 0:
            
            linkend_query_string = ""
            for xref in doc['xrefs']:
                linkend_query_string += f"'{(xref.linkend)}',"
            
            linkend_query_string = f"SELECT c.section_title, c.section_id, c.para from c WHERE c.section_id IN ({linkend_query_string.rstrip(',')})"
            linked_items = container_2partition.query_items( 
                query=linkend_query_string, 
                parameters=[], 
                enable_cross_partition_query=True)
            for linked_item in linked_items:
                #print(f"Main Context: {context}")
                context += f"""Section Title: {linked_item['section_title']}\n
                Section ID: {linked_item['section_id']}\n
                Paragraph: {linked_item['para']}\n
                """
            
                
            

print(context)


Main Section:
    Section Title: Backing up the archive logs that are in a DB2 EEE environment

    Section ID: v24410043

    Paragraph: The policy you use to back up the archive logs depends on the method you use for log archiving. If you use the user exit program, create a Standard policy. If you use the VENDOR method, you can use the DB2 Application Backup schedule.

    Reference Sections: 

    Main Section:
    Section Title: Configuring a policy to archive the archive logs

    Section ID: id-SF0A0118459

    Paragraph: This topic describes how to create a policy to archive the NetBackup DB2 archive log entries in the ARCDIR directory. Follow these instructions if you want to use the user exit program with the ARCFUNC COPY command.When NetBackup performs an archive, it deletes the online files after they are backed up successfully.For more information on user archive schedules, see the NetBackup Administrator's Guide, Volume I.You do not need to perform this procedure if you us

ModuleNotFoundError: No module named 'cosmosdb_helper'

In [149]:
items[0]

TypeError: 'ItemPaged' object is not subscriptable

## Hybrid search with nested fields

In [73]:
query_string = f"""
SELECT TOP 200
c.section_title, linked_para
FROM c
JOIN x IN c.xrefs
JOIN (SELECT x.para from x where x.linkend = c.id) as linked_para


ORDER BY RANK RRF(VectorDistance(c.paraVector, {search_query_embedded[0].embedding}), 
VectorDistance(c.summaryVector, {search_query_embedded[0].embedding}), 
VectorDistance(c.sectionVector, {search_query_embedded[0].embedding}),
VectorDistance(c.topicVector, {search_query_embedded[0].embedding}),
VectorDistance(c.keywordVector, {search_query_embedded[0].embedding}),
FullTextScore(c.para, {search_query_arr}))
"""

items = container_2partition.query_items( 
 query=query_string, 
parameters=[], 
 enable_cross_partition_query=True)

for item in items:
    print(item)

CosmosHttpResponseError: (BadRequest) One of the input values is invalid.
ActivityId: dc705af2-67d3-4a23-83e7-8afaf05f684e, Windows/10.0.20348 cosmos-netstandard-sdk/3.18.0
Code: BadRequest
Message: One of the input values is invalid.
ActivityId: dc705af2-67d3-4a23-83e7-8afaf05f684e, Windows/10.0.20348 cosmos-netstandard-sdk/3.18.0

## Multi Vector Search

In [46]:
# Define weights for each score (adjust as necessary)
para_weight = 1.0 #0.7
section_weight = 1.0  #0.3

# Fetch and sort items using a combined score
items = list(container.query_items( 
    query='SELECT TOP 10 c.section_title, c.para, VectorDistance(c.paraVector, @embedding) AS paraSimilarityScore, VectorDistance(c.sectionVector, @embedding) AS sectionSimilarityScore FROM c', 
    parameters=[ 
        {"name": "@embedding", "value": search_query_embedded[0].embedding} 
    ], 
    enable_cross_partition_query=True
))

# Calculate combined score and sort
for item in items:
    item['combinedScore'] = para_weight * item['paraSimilarityScore'] + section_weight * item['sectionSimilarityScore']


# Sort items by the combined score
sorted_items = sorted(items, key=lambda x: x['combinedScore'], reverse=True)

for item in sorted_items:
  print(item['section_title'])
  print("************")
  print(item['para'])
  print("----------------------------------------------------")
# Display sorted items
#for item in sorted_items:
#    print(item['para'], item['paraSimilarityScore'], item['sectionSimilarityScore'], item['combinedScore'])



About NetBackup
************
NetBackup provides a complete, flexible data protection solution for a variety of platforms. The platforms include Windows, UNIX, and Linux systems.NetBackup administrators can set up periodic or calendar-based schedules to perform automatic, unattended backups for clients across a network. An administrator can carefully schedule backups to achieve systematic and complete backups over a period of time, and optimize network traffic during off-peak hours. The backups can be full or incremental: Full backups back up all indicated client files, while incremental backups back up only the files that have changed since the last backup.The NetBackup administrator can allow users to back up, restore, or archive the files from their computer. (An archive operation backs up a file, then deletes it from the local disk if the backup is successful.)Server software resides on the computer that manages the storage devices.Client software resides on computers that contain d

In [None]:

# Query for items 
for item in container.query_items( 
    query='SELECT c.para, VectorDstance(c.paraVector,@embedding) AS SimilarityScore FROM c ORDER BY VectorDistance(c.paraVector,@embedding)', 
 parameters=[ 
  {"name": "@embedding", "value": search_query_embedded[0].embedding} 
 ],
 enable_cross_partition_query=True):
    print(json.dumps(item, indent=True))

CosmosHttpResponseError: (BadRequest) One of the input values is invalid.
ActivityId: 565f9767-701e-47fd-9ecc-738f5cdefef7, Windows/10.0.20348 cosmos-netstandard-sdk/3.18.0
Code: BadRequest
Message: One of the input values is invalid.
ActivityId: 565f9767-701e-47fd-9ecc-738f5cdefef7, Windows/10.0.20348 cosmos-netstandard-sdk/3.18.0