In [1]:
%%capture
pip install docling pymilvus ipywidgets

In [2]:
%%capture
pip install "pymilvus[model]"

In [13]:
from docling.document_converter import DocumentConverter
from pymilvus import MilvusClient
from pymilvus import connections
from pymilvus import model
from docling.chunking import HybridChunker

In [4]:
client = MilvusClient("http://vectordb-milvus.milvus.svc.cluster.local:19530", user="root", password="Milvus")

In [6]:
# Variable for collection name
collection_name = "openshift_ai_documentation"

# Delete collection if the collection exists
if client.has_collection(collection_name=collection_name):
    print("going to delete ", collection_name)
    client.drop_collection(collection_name=collection_name)

# Create collection
print("Creating Collection ", collection_name)   
client.create_collection(
    collection_name=collection_name,
    dimension=768,  # The vectors we will use in this demo has 768 dimensions
)

Creating Collection  openshift_ai_documentation


In [7]:
# Define embedding model
embedding_fn = model.DefaultEmbeddingFunction()


tokenizer_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/245 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/46.9M [00:00<?, ?B/s]

# Fetching data

In [8]:
from urllib.parse import urlparse
def get_file_name_from_url(url):
    # Parse the URL to extract the path
    parsed_url = urlparse(url)
    # Extract the file name from the path
    file_name = parsed_url.path.split('/')[-1]
    
    return file_name

In [9]:
def get_metadata_from_filename(filename):
    metadata = filename.split("-")
    return {
            "product_name": metadata[0],
            "version": metadata[2],
            "section": metadata[3],
            "language": metadata[4]
        }

In [23]:
base_url="https://docs.redhat.com/en/documentation/red_hat_openshift_ai_self-managed/2.16/pdf/"
source_urls=[base_url + "monitoring_data_science_models/Red_Hat_OpenShift_AI_Self-Managed-2.16-Monitoring_data_science_models-en-US.pdf",
              base_url + "release_notes/Red_Hat_OpenShift_AI_Self-Managed-2.16-Release_notes-en-US.pdf",]

chunker = HybridChunker(tokenizer="BAAI/bge-small-en-v1.5")
converter = DocumentConverter()

print("CAUTION: MAX FILE URLS EQUALS 100")

## Define Empty Vector Array
vectors = []

for file_index,file in enumerate(source_urls):
    ## Retrieve metadata from one file
    metadata = get_metadata_from_filename(get_file_name_from_url(file))
    print(metadata)
    
    ## Parse document from source chunk it
    converted_source_file = converter.convert(file)
    document = converted_source_file.document
    chunk_iter = chunker.chunk(document)
    ## Create chunk_list with the parts of the document
    chunk_list = list(chunk_iter)


    chunk_vectors = embedding_fn.encode_documents([chunk.text for chunk in chunk_list])


    for i, chunk in enumerate(chunk_list):
        vectors.append({
            "id": int(str(file_index * 100) + str(i)), 
            "product_name": metadata.get("product_name", "null"),
            "version": metadata.get("version", "null"),
            "section": metadata.get("section", "null"),
            "language": metadata.get("language", "null"),
            "vector": chunk_vectors[i] , 
            "text": chunk.text,
        })




print(vectors[0])


CAUTION: MAX FILE URLS EQUALS 100
{'product_name': 'Red_Hat_OpenShift_AI_Self', 'version': '2.16', 'section': 'Monitoring_data_science_models', 'language': 'en'}


Token indices sequence length is longer than the specified maximum sequence length for this model (925 > 512). Running this sequence through the model will result in indexing errors


{'product_name': 'Red_Hat_OpenShift_AI_Self', 'version': '2.16', 'section': 'Release_notes', 'language': 'en'}
{'id': 0, 'product_name': 'Red_Hat_OpenShift_AI_Self', 'version': '2.16', 'section': 'Monitoring_data_science_models', 'language': 'en', 'vector': array([ 7.88867869e-03,  4.08021641e-03, -1.15342606e-02, -2.14147612e-03,
       -2.59131469e-02, -5.34342581e-02, -1.83294431e-02,  5.29256430e-02,
        3.46607589e-02,  9.57935971e-03,  2.97208857e-03, -5.87183352e-02,
        1.08694295e-02,  2.84521404e-02,  1.76512915e-02,  2.77943203e-02,
       -4.45831762e-02,  2.01048377e-02,  3.66864159e-02, -4.36195331e-02,
       -1.00510664e-01, -3.28423994e-03, -1.59814072e-03, -2.10822848e-02,
        3.81783612e-02, -2.02231558e-02, -1.16402697e-02,  6.59706410e-02,
       -3.74384203e-03, -1.25397937e-02,  1.76071698e-02, -4.17454179e-04,
        8.94432823e-03, -4.55577667e-02, -3.81062838e-03, -1.45016591e-03,
        1.06563433e-03,  3.82353308e-02, -1.75184451e-02,  3.304547

# Insert File Data

In [24]:
# Insert data
res = client.insert(collection_name=collection_name, data=vectors)

# Check Output
print(res)

{'insert_count': 286, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 10010, 10011, 10012, 10013, 10014, 10015, 10016, 10017, 10018, 10019, 10020, 10021, 10022, 10023, 10024, 10025, 10026, 10027, 10028, 10029, 10030, 10031, 10032, 10033, 10034, 10035, 10036, 10037, 10038, 10039, 10040, 10041, 10042, 10043, 10044, 10045, 10046, 10047, 10048, 10049, 10050, 10051, 10052, 10053, 10054, 10055, 10056, 10057, 10058, 10059, 10060, 10061, 10062, 10063, 10064, 10065, 10066, 10067, 10068, 10069, 10070, 10071, 10072, 10073, 10074, 10075

In [None]:
# Query Milvus with search query

In [29]:
# Define vector question
question_vectors = embedding_fn.encode_queries(["What to use trusty AI for?"])

# Search data using a Vector base approach with questions and relationships
res = client.search(
    collection_name=collection_name,  # target collection
    data=question_vectors,  # query vectors
    limit=2,  # number of returned entities
    filter="version == '2.16'",
    output_fields=["text", "version"],  # specifies fields to be returned
)

print(res)

data: ["[{'id': 41, 'distance': 0.4809015393257141, 'entity': {'text': 'To use TrustyAI for bias monitoring or data drift detection, you must send training data for your model to TrustyAI.', 'version': '2.16'}}, {'id': 17, 'distance': 0.4694754183292389, 'entity': {'text': 'To allow your data scientists to use model monitoring with TrustyAI, you must enable the TrustyAI component in OpenShift AI.', 'version': '2.16'}}]"] 
