In [32]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from langchain_ollama import OllamaEmbeddings
from langchain_qdrant import FastEmbedSparse, RetrievalMode
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document

import config
import pandas as pd

In [33]:
from qdrant_client import QdrantClient

# Connect to the Qdrant instance described by the `config` module and print the
# collections it currently holds as a quick connectivity check.
url, api_key = config.QDRANT_URL, config.QDRANT_API_KEY
qdrant_client = QdrantClient(url=url, api_key=api_key)

existing_collections = qdrant_client.get_collections()
print(existing_collections)

collections=[CollectionDescription(name='flowers_policy'), CollectionDescription(name='flowers'), CollectionDescription(name='mlops_document')]


In [38]:
# Destructive: removes the "flowers" collection from the connected Qdrant instance.
# The call's result is shown as the cell output.
qdrant_client.delete_collection(collection_name="flowers")

True

In [39]:
# Destructive: removes the "flowers_policy" collection from the connected Qdrant instance.
# The call's result is shown as the cell output.
qdrant_client.delete_collection(collection_name="flowers_policy")

False

# 2. Ingest Flower Products

## 2.1  Data Loading

In [4]:
# Read the flower product sheet from the Excel workbook into a DataFrame.
flowers_xlsx = "../dataset/flowers.xlsx"
df = pd.read_excel(flowers_xlsx)

In [7]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Product_Id,Product_Name,Best_Occasion,Description,Price_RM,Blooms,Bouquet_Size
0,P001,White On White,Anniversary,"Introducing our White On White Bouquet, a stun...",420,"White Hydrangea, Dahlia, White Tulips and Whit...",Medium
1,P002,Sweet Serenity,Anniversary,Allow the Sweet Serenity Bouquet to script you...,95,Cappucino Soap Rose and Light Pink Soap Rose,Small / Medium
2,P003,My Girl Bouquet,Anniversary,My Girl Bouquet Description\nLet’s get to know...,280,"Pink Avalanche, White Tulip and Eucalyptus Bab...",Medium
3,P004,Blooming Garden Bouquet,Anniversary,Introducing our stunning Blooming Garden Bouqu...,300,"Blue Hydrangea, Purple Roses, Purple Rose Spra...",Medium
4,P005,Sweet Aurora Bouquet,Anniversary,"Imagine this: It’s your friend’s birthday, and...",400,"Sweet Aurora Ecuador Roses, White Roses, Ocean...",Medium


## 2.2 Prepare Chunks

In [16]:
# Build one LangChain Document per spreadsheet row: name, occasion and description
# form the page content; the remaining product attributes are stored as metadata.
langchain_documents = []
for _, product in df.iterrows():
    # Same text as before, including the final line of four spaces.
    page_text = (
        f"Product Name: {product['Product_Name']}\n"
        f"Best Occasion: {product['Best_Occasion']}\n"
        f"Product Description: {product['Description']}\n"
        "    "
    )

    product_metadata = dict(
        product_id=product['Product_Id'],
        product_name=product['Product_Name'],
        price_rm=product['Price_RM'],
        blooms=product['Blooms'],
        bouquet_size=product['Bouquet_Size'],
    )

    langchain_documents.append(
        Document(page_content=page_text, metadata=product_metadata)
    )



In [17]:
# Display the documents built in the previous cell (the whole list is rendered).
langchain_documents

[Document(metadata={'product_id': 'P001', 'product_name': 'White On White', 'price_rm': 420, 'blooms': 'White Hydrangea, Dahlia, White Tulips and White Roses', 'bouquet_size': 'Medium'}, page_content='Product Name: White On White\nBest Occasion: Anniversary\nProduct Description: Introducing our White On White Bouquet, a stunning arrangement perfect for celebrating birthdays and cherished friendships.\n\nWhite Hydrangea, with its virtue and grace, adds an elegant touch to birthdays and symbolizes the gracefulness of a lasting friendship.\nMeanwhile, Dahlias represent eternal love and commitment, making them a wonderful choice to show your lasting friendship bond on a special day.\nThe White Tulips express respect and faith, emphasizing the trust and admiration in a friendship that deserves birthday celebrations.\nFinally, the premium White Roses convey value and anticipation, promising every birthday of good times ahead in your treasured friendship. A graceful reminder that each passing

## 2.3 Ingest to Qdrant DB

In [19]:
# Ingest the product documents into the "flowers" Qdrant collection using hybrid
# retrieval (dense Jina embeddings + sparse BM25 embeddings).
import torch  # only needed here to pick the embedding device

print("Ingest to Vector Database Start.")
url = config.QDRANT_URL
api_key = config.QDRANT_API_KEY

# Sparse embeddings used alongside the dense ones in HYBRID mode.
sparse_embeddings = FastEmbedSparse(
    model_name="Qdrant/bm25"
)

# Dense embeddings. Run on the GPU when one is visible and fall back to the CPU
# otherwise, instead of failing on machines without CUDA.
model_name = "jinaai/jina-embeddings-v3"
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE: trust_remote_code=True lets the model repository's own code run locally.
model_kwargs = {'device': device, 'trust_remote_code': True}
encode_kwargs = {'normalize_embeddings': False}
jina_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

embeddings = jina_embeddings

# NOTE(review): re-running this cell against an existing "flowers" collection
# presumably adds the same documents again — confirm, and delete the collection
# first when a clean re-ingest is wanted.
QdrantVectorStore.from_documents(
    langchain_documents,
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    url=url,
    prefer_grpc=True,
    api_key=api_key,
    collection_name="flowers",
    retrieval_mode=RetrievalMode.HYBRID,
)
print("Ingest to Vector Database End.")

Ingest to Vector Database Start.


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

Ingest to Vector Database End.


# 3. Ingest Policy

## 3.1 Load Document

In [29]:
import re

# Relative path to the shop-policy PDF; replace it with the path to your own PDF file.
file_path = "../Online_Flower_Shop_Policies.pdf"
# Load the PDF with PyPDFLoader; the next cell reads `page_content` from each loaded item.
loader = PyPDFLoader(file_path)
documents = loader.load()

## 3.2 Prepare Chunks

In [30]:
# Split the policy PDF into one chunk per numbered section ("1. ...", "2. ...", ...).

# Join the pages with a newline so the last word of one page is not glued to the
# first word (or section number) of the next page.
text = "\n".join(document.page_content for document in documents)

# Zero-width split in front of every 1- or 2-digit number followed by '.' and whitespace.
# The (?<!\d) lookbehind keeps the split from also firing in the middle of a number:
# without it "10. Refunds" is cut into "1" and "0. Refunds".
# NOTE: a number that merely ends a sentence (e.g. "within 24. ") still starts a new chunk.
sections = re.split(r'(?<!\d)(?=\d{1,2}\.\s)', text.strip())

# Remove empty strings and strip whitespace
sections = [section.strip() for section in sections if section.strip()]

# One Document per section, tagged so policy chunks can be told apart by metadata.
langchain_documents = [Document(page_content=section, metadata={'document_type':'policy'}) for section in sections]

## 3.3 Ingest Policy to Qdrant

In [31]:
# Ingest the policy section documents into the "flowers_policy" Qdrant collection
# using hybrid retrieval (dense Jina embeddings + sparse BM25 embeddings).
import torch  # only needed here to pick the embedding device

print("Ingest to Vector Database Start.")
url = config.QDRANT_URL
api_key = config.QDRANT_API_KEY

# Sparse embeddings used alongside the dense ones in HYBRID mode.
sparse_embeddings = FastEmbedSparse(
    model_name="Qdrant/bm25"
)

# Dense embeddings. Run on the GPU when one is visible and fall back to the CPU
# otherwise, instead of failing on machines without CUDA.
model_name = "jinaai/jina-embeddings-v3"
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE: trust_remote_code=True lets the model repository's own code run locally.
model_kwargs = {'device': device, 'trust_remote_code': True}
encode_kwargs = {'normalize_embeddings': False}
jina_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

embeddings = jina_embeddings

# NOTE(review): re-running this cell against an existing "flowers_policy" collection
# presumably adds the same documents again — confirm, and delete the collection
# first when a clean re-ingest is wanted.
QdrantVectorStore.from_documents(
    langchain_documents,
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    url=url,
    prefer_grpc=True,
    api_key=api_key,
    collection_name="flowers_policy",
    retrieval_mode=RetrievalMode.HYBRID,
)
print("Ingest to Vector Database End.")

Ingest to Vector Database Start.


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

Ingest to Vector Database End.
