In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from langchain_ollama import OllamaEmbeddings
from langchain_qdrant import FastEmbedSparse, RetrievalMode
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceEmbeddings

import config



In [2]:
from qdrant_client import QdrantClient

# Qdrant endpoint and credentials are read from the local `config` module.
url, api_key = config.QDRANT_URL, config.QDRANT_API_KEY

# Client handle; the next cell uses it to manage collections.
qdrant_client = QdrantClient(url=url, api_key=api_key)

# Print the collections currently reported by the server.
print(qdrant_client.get_collections())

collections=[CollectionDescription(name='mlops_document')]


In [3]:
# Delete the 'mlops_document' collection on the server. The ingest step further
# down writes to a collection of the same name, so this is presumably a reset
# before re-ingesting -- NOTE(review): destructive, run deliberately.
qdrant_client.delete_collection('mlops_document')

True

# 1. Load PDFs and split them into chunks

In [4]:
# -------------------------------------------------------------------------
# 1. Read every configured PDF with PyPDFLoader; other file types are skipped
# -------------------------------------------------------------------------
print("Loading PDFs Start.")

file_paths = config.PDF_FILE_PATHS
docs = []

for file_path in file_paths:
    # Guard clause: anything without a .pdf extension (case-insensitive) is
    # reported and ignored.
    if not file_path.lower().endswith(".pdf"):
        print(f"Skipping non-PDF file: {file_path}")
        continue

    # Append this file's loaded documents to the shared `docs` list.
    loader = PyPDFLoader(file_path)
    docs += loader.load()

print("Loading PDFs End.")

Loading PDFs Start.
Loading PDFs End.


In [9]:
# -------------------------------------------------------------------------
# 2. Break the loaded documents into overlapping chunks
# -------------------------------------------------------------------------
print("Splitting Documents into Chunks Start.")

# Chunk size 1200 with an overlap of 200 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1200,
)
chunks = text_splitter.split_documents(docs)

print("Splitting Documents into Chunks End.")

Splitting Documents into Chunks Start.
Splitting Documents into Chunks End.


# 2. Ingest into Vector DB

In [None]:
# -------------------------------------------------------------------------
# 3. Ingest to Vector Database: Qdrant
# -------------------------------------------------------------------------
print("Ingest to Vector Database Start.")
url, api_key = config.QDRANT_URL, config.QDRANT_API_KEY

# Sparse side of the hybrid setup.
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

# Dense side: a Hugging Face model configured to normalize its embeddings.
model_name = "jinaai/jina-embeddings-v3"
model_kwargs = {
    'device': 'cuda',
    "trust_remote_code": True,
}
encode_kwargs = {'normalize_embeddings': True}
hf_embed = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# Disabled alternative: OllamaEmbeddings(model="nomic-embed-text:v1.5")
embeddings = hf_embed

# Hand the chunks to Qdrant with both dense and sparse embeddings attached.
QdrantVectorStore.from_documents(
    chunks,
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    retrieval_mode=RetrievalMode.HYBRID,
    collection_name="mlops_document",
    url=url,
    api_key=api_key,
    prefer_grpc=True,
)
print("Ingest to Vector Database End.")

Ingest to Vector Database Start.


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

Ingest to Vector Database End.


# Final: Put everything into one function

In [None]:
def ingest_to_vector_db(
    file_paths=None,
    collection_name="mlops_document",
    chunk_size=1200,
    chunk_overlap=200,
    device="cuda",
):
    """Load PDFs, split them into chunks, and ingest the chunks into Qdrant.

    Called with no arguments this behaves as before: it reads
    ``config.PDF_FILE_PATHS``, splits with chunk size 1200 / overlap 200,
    embeds on ``cuda`` and writes to the ``mlops_document`` collection.

    Args:
        file_paths: Iterable of file paths (``str`` or ``pathlib.Path``).
            Entries whose name does not end in ``.pdf`` (case-insensitive)
            are skipped with a printed message. Defaults to
            ``config.PDF_FILE_PATHS``.
        collection_name: Name of the Qdrant collection passed to
            ``QdrantVectorStore.from_documents``.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to
            ``RecursiveCharacterTextSplitter``.
        device: Device string forwarded to the embedding model through
            ``model_kwargs`` (e.g. ``"cuda"`` or ``"cpu"``).

    Returns:
        None. Progress is reported with ``print``.
    """
    # -------------------------------------------------------------------------
    # 1. Load multiple PDFs using PyPDFLoader, only if the file is a PDF
    # -------------------------------------------------------------------------
    print("Loading PDFs Start.")

    if file_paths is None:
        file_paths = config.PDF_FILE_PATHS

    docs = []
    for file_path in file_paths:
        # str() lets pathlib.Path entries pass through the extension check;
        # for plain strings it is a no-op.
        path_str = str(file_path)
        if path_str.lower().endswith(".pdf"):
            docs.extend(PyPDFLoader(path_str).load())
        else:
            print(f"Skipping non-PDF file: {file_path}")

    print("Loading PDFs End.")

    # -------------------------------------------------------------------------
    # 2. Split the docs into smaller chunks
    # -------------------------------------------------------------------------
    print("Splitting Documents into Chunks Start.")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_documents(docs)
    print("Splitting Documents into Chunks End.")

    # -------------------------------------------------------------------------
    # 3. Ingest to Vector Database: Qdrant
    # -------------------------------------------------------------------------
    print("Ingest to Vector Database Start.")
    url = config.QDRANT_URL
    api_key = config.QDRANT_API_KEY

    # Sparse embeddings, used alongside the dense ones in hybrid mode.
    sparse_embeddings = FastEmbedSparse(
        model_name="Qdrant/bm25"
    )

    # Dense text embeddings with normalized output vectors.
    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run locally -- confirm this is acceptable for the
    # deployment environment.
    embeddings = HuggingFaceEmbeddings(
        model_name="jinaai/jina-embeddings-v3",
        model_kwargs={'device': device, "trust_remote_code": True},
        encode_kwargs={'normalize_embeddings': True},
    )

    QdrantVectorStore.from_documents(
        chunks,
        embedding=embeddings,
        sparse_embedding=sparse_embeddings,
        url=url,
        prefer_grpc=True,
        api_key=api_key,
        collection_name=collection_name,
        retrieval_mode=RetrievalMode.HYBRID,
    )
    print("Ingest to Vector Database End.")

# Execute the whole pipeline (load, split, ingest) in one call.
ingest_to_vector_db()
