# Prerequisites

In [1]:
%%capture
pip install requests docling transformers ipywidgets minio

In [2]:
%%capture
pip install llama-index-core llama-index-readers-docling llama-index-node-parser-docling llama-index-embeddings-huggingface llama-index-llms-huggingface-api llama-index-vector-stores-milvus llama-index-readers-file python-dotenv;

In [3]:
## Docling and chunker for text parsing
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
## embeddings generator
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

# Docling ingestion

## Define global variable

In [4]:
## Source for the documents for docling parsing of chunks
source_file = "https://docs.redhat.com/en/documentation/red_hat_openshift_ai_self-managed/2.16/pdf/monitoring_data_science_models/Red_Hat_OpenShift_AI_Self-Managed-2.16-Monitoring_data_science_models-en-US.pdf"  # document per local path or URL

## 0) Ingest documentation - Test

In [5]:
## Define Converter
converter = DocumentConverter()

## Parse document from source and store in variable "document"
document = converter.convert(source_file)


## Print to check everything is working
# print(document.document.export_to_markdown())
lines = document.document.export_to_markdown().splitlines()
for line in lines[:10]:
    print(line)

<!-- image -->

## Red Hat OpenShift AI Self-Managed 2.16

## Monitoring data science models

Monitor your OpenShift AI models for fairness

## Red Hat OpenShift AI Self-Managed 2.16 Monitoring data science models



## Ingesting Multiple files

In [None]:
source_files=["https://docs.redhat.com/en/documentation/red_hat_openshift_ai_self-managed/2.16/pdf/monitoring_data_science_models/Red_Hat_OpenShift_AI_Self-Managed-2.16-Monitoring_data_science_models-en-US.pdf", ""]

converter = DocumentConverter()

files = new Docling.Document()

for i in source_files:
    document = converter.convert(i)
    files.append(document)
    

## 1) Chunking the downloaded document using Docling

In [6]:
## Parse document from source and store in variable "document"
converted_source_file = DocumentConverter().convert(source_file)
document = converted_source_file.document

## Create chubker and chuck document
chunker = HybridChunker(tokenizer="BAAI/bge-small-en-v1.5")  # set tokenizer as needed
chunk_iter = chunker.chunk(document)

## Create chunk_list with the parts of the document
chunk_list = list(chunk_iter)

#for i,chunk in enumerate(chunk_iter):
    #print(i)
    #print(chunk)

## Print to check everything is working
# print(chunk_list[0])
print(chunk_list[0].text)

Token indices sequence length is longer than the specified maximum sequence length for this model (925 > 512). Running this sequence through the model will result in indexing errors


Monitor your OpenShift AI models for fairness
Last Updated: 2024-12-11


## 2) Create embeddings

In [7]:
## Defined embedding model and import it
embedding_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

## Define Empty Vector Array
vectors = []
## Populate Vector Array

for i, chunk in enumerate(chunk_list):
    vectors.append({
        "id": i, 
        "vector": embedding_model.get_text_embedding(chunk.text) , 
        "text": chunk.text
    })

## Print to check everything is working
print(f'Length of the Vector: {len(vectors)}' )

Length of the Vector: 112


## 3) Upload embeddings to Milvus

In [8]:
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType

## Connect to Milvus (adjust host/port as needed)
connections.connect("default", 
    host="vectordb-milvus.milvus.svc.cluster.local", 
    port="19530",
    token="root:Milvus"
)

## Define a collection schema (adjust dimensions based on your embedding size)
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=len(vectors[0]["vector"])),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65000),
    ## TODO: Add version
    #    FieldSchema(name="version", dtype=DataType.VARCHAR, max_length=1024),
]
schema = CollectionSchema(fields, description="RAG embeddings collection")

## Create or load a collection
collection_name = "rag_embeddings"
collection = Collection(name=collection_name, schema=schema)

## TODO: Create an index for faster similarity search
index_params = {"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": len(vectors[0]["vector"])}}
collection.create_index(field_name="vector", index_params=index_params)

## Load the collection for querying
collection.load()

In [9]:
## Insert vectors into Milvus
insert_output = collection.insert(vectors);

## Print to check everything is working
print(insert_output)

## Load the collection for querying
collection.load()

(insert count: 112, delete count: 0, upsert count: 0, timestamp: 455326334817665028, success count: 112, err count: 0


## 3) Query the Database to obtain RAG parsed information

In [10]:
## TODO: search
# res = client.search(
#     collection_name="my_collection",
#     data=[query_vector],
#     limit=5,
#     # highlight-start
#     filter='color like "red%" and likes > 50',
#     output_fields=["color", "likes"]
#     # highlight-end
# )

In [11]:
## TODO: 
- Creating pipeline to store more data in Vector Database

SyntaxError: invalid syntax (3796298882.py, line 2)

In [15]:
# Search data
query_vectors = embedding_model.get_text_embedding("What is Openshift AI")

res = collection.search(
    # target collection
    data=query_vectors,  # query vectors
    limit=3,
    anns_field="vector",
    param={"metric_type": "IP"},
    output_fields=["text"]
)

2025-01-15 09:30:44,428 [ERROR][handler]: RPC error: [search], <ParamError: (code=1, message=`search_data` value [0.018916407600045204, -0.06553012877702713, -0.03890419006347656, -0.000947585329413414, 0.04297740012407303, -0.007084385491907597, 0.005829900037497282, 0.006629869341850281, 0.062475334852933884, -0.03228267654776573, 0.01589363068342209, -0.02252667024731636, 0.06959574669599533, 0.06449303030967712, 0.05231038108468056, 0.030422084033489227, -0.04782497510313988, 0.019731031730771065, -0.01748519577085972, 0.0006630318239331245, 0.05464634299278259, -0.06150694936513901, -0.003995751962065697, -0.06004820391535759, -0.06094786152243614, 0.05040329322218895, -0.015113306231796741, -0.04495735466480255, -0.07351983338594437, -0.10985855013132095, -0.018881183117628098, 0.0063013723120093346, 0.04261249676346779, -0.02894873358309269, 0.043270234018564224, 0.009354321286082268, -0.041576217859983444, -0.05026301369071007, -0.07358546555042267, -0.008998801000416279, -0.00

ParamError: <ParamError: (code=1, message=`search_data` value [0.018916407600045204, -0.06553012877702713, -0.03890419006347656, -0.000947585329413414, 0.04297740012407303, -0.007084385491907597, 0.005829900037497282, 0.006629869341850281, 0.062475334852933884, -0.03228267654776573, 0.01589363068342209, -0.02252667024731636, 0.06959574669599533, 0.06449303030967712, 0.05231038108468056, 0.030422084033489227, -0.04782497510313988, 0.019731031730771065, -0.01748519577085972, 0.0006630318239331245, 0.05464634299278259, -0.06150694936513901, -0.003995751962065697, -0.06004820391535759, -0.06094786152243614, 0.05040329322218895, -0.015113306231796741, -0.04495735466480255, -0.07351983338594437, -0.10985855013132095, -0.018881183117628098, 0.0063013723120093346, 0.04261249676346779, -0.02894873358309269, 0.043270234018564224, 0.009354321286082268, -0.041576217859983444, -0.05026301369071007, -0.07358546555042267, -0.008998801000416279, -0.007698061875998974, 0.022255172953009605, -0.004397645592689514, -0.015059944242238998, 0.0831492468714714, -0.01604350097477436, 0.010776412673294544, 0.017013780772686005, -0.054315485060214996, -0.04125639796257019, -0.04946082830429077, -0.0012736389180645347, -0.015222780406475067, 0.006795905996114016, 0.06959494203329086, 0.007623095065355301, -0.008538107387721539, 0.0578058660030365, -0.0022369546350091696, 0.07100000232458115, 0.055097997188568115, 0.0427919365465641, -0.08446279168128967, 0.053637076169252396, 0.05768907815217972, 0.0012872250517830253, 0.02144758217036724, -0.09577302634716034, 0.05184632167220116, -0.06348399072885513, 0.0062585556879639626, -0.04705648869276047, 0.062132008373737335, -0.019804377108812332, -0.013168083503842354, 0.05572675168514252, -0.03303905948996544, 0.019097886979579926, 0.011534648947417736, 0.021246587857604027, -0.043726835399866104, -0.019137462601065636, -0.061243295669555664, 0.00392927648499608, -0.08417265862226486, 0.00645196670666337, -0.024148056283593178, 0.005087278317660093, 0.0854397565126419, -0.02899695187807083, -0.031521640717983246, 0.07119147479534149, 0.03867705538868904, -0.023044994100928307, 0.010080136358737946, 0.06939459592103958, 0.02845858410000801, -0.0035702220629900694, -0.010535236448049545, 0.28993064165115356, -0.016074128448963165, 0.0004339422448538244, -0.06854154914617538, -0.0093361996114254, 0.047807496041059494, -0.02204877883195877, -0.01015295460820198, -0.005731453653424978, -0.060381099581718445, 0.02822890318930149, -0.00021379154350142926, -0.03201304003596306, 0.09956123679876328, -0.017470991238951683, 0.009059048257768154, -0.01320360042154789, -0.007169108372181654, -0.008328552357852459, -0.014437008649110794, 0.03164815902709961, -0.016301492229104042, 0.00913591030985117, 0.004898780025541782, 0.016929879784584045, -0.04300551116466522, 0.010717085562646389, 0.020432548597455025, 0.08295033127069473, 0.00839648675173521, 0.021001312881708145, 0.03629259392619133, 0.048934515565633774, -0.04647490382194519, -0.0576329380273819, 0.056297264993190765, 0.0604933463037014, 0.024427810683846474, 0.016238372772932053, -0.01918506994843483, 0.05290035158395767, -0.07003923505544662, 0.01629437878727913, 0.029389750212430954, -0.08910703659057617, -0.02404778078198433, 0.051047611981630325, 0.0447651669383049, 0.041296012699604034, -0.044944245368242264, -0.05950205773115158, 0.029919343069195747, 0.009315788745880127, -0.04078936204314232, -0.07104609906673431, 0.02660961262881756, 0.017170604318380356, 0.033225443214178085, -0.015238919295370579, -0.06505544483661652, 0.059563178569078445, -0.05990481376647949, -0.06625521928071976, -0.04592597484588623, 0.07326801866292953, -0.00017535555525682867, -0.13208112120628357, -0.0034760802518576384, -0.025032224133610725, -0.042149562388658524, 0.02275683917105198, 0.012261985801160336, 0.06016125530004501, -0.03385382890701294, 0.03462795913219452, 0.0289654191583395, -0.08043434470891953, -0.008724894374608994, -0.015058978460729122, 0.003875496331602335, 0.02852492406964302, -0.041415080428123474, -0.06998664885759354, 0.0020724984351545572, 0.02975034900009632, 0.006759108975529671, -0.08291278779506683, -0.06360547244548798, -0.05026177689433098, -0.06335426867008209, -0.05577607825398445, -0.0811443105340004, 0.04104302451014519, -0.004670990165323019, 0.04239952191710472, -0.01759614422917366, -0.02873746119439602, -0.05029812082648277, 0.04647010564804077, -0.03614570572972298, 0.03414465859532356, 0.03207585588097572, 0.043543606996536255, 0.031292397528886795, 0.02018790878355503, -0.04402806982398033, -0.04027944803237915, 0.0029055362101644278, -0.04585706815123558, 0.0211949422955513, 0.052010130137205124, -0.02859794721007347, 0.012887316755950451, -0.04489249736070633, 0.05199668928980827, -0.054242152720689774, -0.013340410776436329, -0.0010513286106288433, -0.053708042949438095, -0.004453436937183142, 0.01259155385196209, 0.011384609155356884, 0.02481977269053459, -0.06198849529027939, -0.2473316192626953, -0.02437554858624935, -0.0029716636054217815, -0.015055270865559578, 0.007430526427924633, -0.10436369478702545, -0.020477551966905594, -0.055227234959602356, 0.04263640195131302, 0.048732269555330276, 0.127870574593544, 0.0422961600124836, 0.014314068481326103, -0.004998255055397749, 0.028990862891077995, -0.03651002049446106, -0.009906751103699207, 0.07399642467498779, -0.08835192024707794, 0.070921890437603, 0.03788944333791733, 0.005641958210617304, -0.009462005458772182, -0.14033597707748413, 0.010037088766694069, -0.0012965654022991657, 0.1380702257156372, -0.020929744467139244, 0.058072853833436966, 0.013981131836771965, -0.037815313786268234, -0.008091791532933712, -0.014877509325742722, -0.13134950399398804, 0.01881743222475052, 0.005810076836496592, 0.0690159797668457, -0.003965320531278849, 0.01892230473458767, 0.04026143625378609, 0.017595704644918442, 0.017711753025650978, 0.0323314368724823, -0.0884469673037529, 0.007580885197967291, 0.022513434290885925, -0.013891304843127728, 0.002361989812925458, -0.026247361674904823, 0.009098256938159466, -0.021430082619190216, 0.043461572378873825, 0.035343240946531296, 0.05930247902870178, -0.01568414643406868, -0.02329287678003311, -0.024778129532933235, 0.041710082441568375, 0.04856671392917633, 0.014119395054876804, 0.04999740049242973, 0.02924773283302784, -0.07315744459629059, -0.04303267225623131, 0.07131733745336533, -0.017308592796325684, 0.027680162340402603, 0.010583293624222279, 0.05216563493013382, -0.09472702443599701, -0.03109320066869259, 0.06465733051300049, 0.00025838016881607473, 0.09920822083950043, 0.07650504261255264, -0.02714710496366024, 0.04131709784269333, -0.018417643383145332, 0.022853493690490723, 0.044623859226703644, 0.03734313324093819, -0.018173275515437126, 0.017117086797952652, 0.028304029256105423, -0.006545831449329853, 0.03890783712267876, 0.004134294111281633, -0.04690399020910263, 0.06927969306707382, 0.06885416060686111, 0.045886460691690445, 0.06108104810118675, -0.026723425835371017, -0.014396348036825657, 0.02831229940056801, 0.008198893629014492, -0.2815168499946594, 0.07186111807823181, -0.013996822759509087, -0.0004056303296238184, -0.03831950202584267, -4.600199099513702e-05, 0.02621791884303093, 0.033543363213539124, -0.03694223612546921, -0.04135531932115555, -0.05505620688199997, -0.007374187465757132, -0.007880053482949734, -0.013117294758558273, -0.000665895699057728, 0.015499880537390709, 0.08483698964118958, -0.0033284316305071115, 0.017109548673033714, -0.08784622699022293, -0.0077406177297234535, 0.07806865125894547, 0.1826341450214386, -0.04724520072340965, 0.05169909819960594, 0.04642165079712868, -0.04020216315984726, -0.02172544039785862, 0.014434612356126308, -0.04118296131491661, 0.01698930375277996, -0.04317622631788254, 0.026210665702819824, -0.012787166982889175, 0.03106512874364853, 0.012932699173688889, -0.02985643967986107, -0.011660393327474594, 0.016012443229556084, -0.04904380440711975, -0.017613502219319344, 0.020533669739961624, 0.051769405603408813, 0.05171139910817146, 0.011245272122323513, 0.008069965988397598, -0.05360205098986626, 0.026347853243350983, 0.03299499675631523, 0.0245467871427536, 0.055100683122873306, -0.05169588327407837, -0.015592521987855434, 0.012754932045936584, 0.008138516917824745, 0.030805012211203575, 0.044433627277612686, -0.007207938004285097, -0.0654120072722435, -0.08089514076709747, -0.005371483042836189, -0.01950402557849884, 0.04683103784918785, 0.04867231473326683, -0.022748669609427452] is illegal)>

## 4) Query Mistral Using RAG information

In [None]:
prompt =f"""
You are a commentator. Your task is to write a report on an essay.
When presented with the essay, come up with interesting questions to ask, and answer each question.
Afterward, combine all the information and write a report in the markdown format.

# Essay:
{markdown_dummy_2}

# Instructions:
## Summarize:
In clear and concise language, summarize the key points and themes presented in the essay.

## Interesting Questions:
Generate three distinct and thought-provoking questions that can be asked about the content of the essay. For each question:
- After "Q: ", describe the problem
- After "A: ", provide a detailed explanation of the problem addressed in the question.
- Enclose the ultimate answer in <>.

## Write a report
Using the essay summary and the answers to the interesting questions, create a comprehensive report in Markdown format.
"""

