In [22]:
import os
from dotenv import load_dotenv
load_dotenv()
from langchain_groq import ChatGroq
from pydantic import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
import pprint
import pdfplumber
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.pdf import PyMuPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document
import pytesseract
from PIL import Image
from qdrant_client import QdrantClient as RawQdrantClient
from qdrant_client.http import models
from langchain_experimental.text_splitter import SemanticChunker
import uuid
from sentence_transformers.cross_encoder import CrossEncoder
from langchain.prompts import PromptTemplate


In [2]:
## LangChain / LangSmith tracing, Groq and Qdrant credentials
# load_dotenv() in the imports cell is what copies the .env entries into
# os.environ. Re-assigning os.environ[name] = os.getenv(name) adds nothing when
# the variable is set and fails with "TypeError: str expected, not NoneType"
# when it is not, so validate once and report every missing name instead.
REQUIRED_ENV_VARS = (
    'LANGCHAIN_API_KEY',   # LangSmith tracing
    'LANGCHAIN_PROJECT',   # LangSmith tracing
    'GROQ_API_KEY',        # Groq inference
    'QDRANT_API_KEY',      # Qdrant vector database
    'QDRANT_URL',          # Qdrant vector database
)
missing_env_vars = [name for name in REQUIRED_ENV_VARS if os.getenv(name) is None]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variables (set them in .env): "
        + ", ".join(missing_env_vars)
    )

# Enable LangSmith tracing for every chain run in this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

EMBEDDING_MODEL = "all-MiniLM-L6-v2"
INFERENCE_MODEL = "gemma2-9b-it"
HNSW_COLLECTION_NAME = "hnsw_collection"
IVF_COLLECTION_NAME = "ivf_collection"
FLAT_COLLECTION_NAME = "flat_collection"

# Payload key under which each chunk's text is stored in Qdrant.
CONTENT_KEY = "text_content"

In [3]:
# Input PDF to ingest, and the directory that receives the cropped page images.
pdf_path = "Generative AI with LangChain (2024).pdf"
image_folder = "images"

# Create the image directory up front; exist_ok keeps re-running this cell harmless.
os.makedirs(name=image_folder, exist_ok=True)

In [4]:
# Make the Tesseract binary discoverable by pytesseract.
# A raw string must use single backslashes: r'C:\\Program Files' would put
# literal doubled backslashes into PATH.
TESSERACT_DIR = r'C:\Program Files\Tesseract-OCR'
current_path = os.environ.get('PATH', '')
# Only append once so that re-running this cell does not keep growing PATH.
if TESSERACT_DIR not in current_path:
    os.environ['PATH'] = (
        current_path + os.pathsep + TESSERACT_DIR if current_path else TESSERACT_DIR
    )

## Helper Functions

### Function to extract text, tables, and images from a PDF

In [5]:
def extract_elements_from_pdf(pdf_path, image_dir=None):
    """Extract page text, tables and OCR'd images from a PDF as Documents.

    The PDF is opened once and walked three times so the result keeps the
    order text documents, then table documents, then image documents.

    Args:
        pdf_path: Path of the PDF file to read.
        image_dir: Directory that receives the cropped image PNGs. Defaults to
            the notebook-level ``image_folder``.

    Returns:
        A list of ``Document`` objects. Each carries ``source``, ``page``
        (1-based) and ``type`` ("text", "table" or "image") in its metadata;
        tables add ``table_num``, images add ``image_num`` and ``image_path``.
        Extraction is best-effort: on failure the error is printed and the
        documents collected so far are returned (an empty list if nothing was
        read), never ``None``.
    """
    documents = []
    try:
        source_name = os.path.basename(pdf_path)
        target_dir = image_folder if image_dir is None else image_dir
        os.makedirs(target_dir, exist_ok=True)

        print(f"Extracting text contents from '{source_name}'")
        with pdfplumber.open(pdf_path) as pdf:
            pages = pdf.pages

            # --- Pass 1: plain page text ---
            for page_num, page in enumerate(pages, start=1):
                # Guard against extract_text() yielding None for a page without text.
                text = page.extract_text() or ""
                if text.strip():
                    metadata = {
                        "source": source_name,
                        "page": page_num,
                        "type": "text"
                    }
                    documents.append(Document(page_content=text, metadata=metadata))

            # --- Pass 2: tables, flattened to tab/newline separated text ---
            print(f"Extracting table contents from '{source_name}'")
            for page_num, page in enumerate(pages, start=1):
                for table_num, table_data in enumerate(page.extract_tables(), start=1):
                    if not table_data:
                        continue
                    # Empty cells become empty strings rather than the text "None".
                    table_content = "\n".join(
                        "\t".join("" if cell is None else str(cell) for cell in row)
                        for row in table_data if row
                    )
                    if not table_content.strip():
                        continue
                    metadata = {
                        "source": source_name,
                        "page": page_num,
                        "table_num": table_num,
                        "type": "table"
                    }
                    # Prefix a header so table chunks are distinguishable from prose.
                    documents.append(Document(
                        page_content=f"Table {table_num} on page {page_num} contains:\n{table_content}",
                        metadata=metadata
                    ))

            # --- Pass 3: embedded images, cropped, saved and OCR'd ---
            print(f"Extracting images from '{source_name}'")
            for page_num, page in enumerate(pages, start=1):
                for image_num, image in enumerate(page.images, start=1):
                    if not image:
                        continue
                    try:
                        # Convert the image's y0/y1 into a crop box measured from the
                        # top of the page (same arithmetic as before).
                        bbox = [image['x0'], page.cropbox[3] - image['y1'],
                                image['x1'], page.cropbox[3] - image['y0']]
                        img_obj = page.crop(bbox=bbox).to_image(resolution=500)
                        # File name uses the same 1-based page number as the metadata.
                        image_path = os.path.join(target_dir, f'{page_num}-{image_num}.png')
                        img_obj.save(image_path)
                        # Close the file handle as soon as OCR is done.
                        with Image.open(image_path) as ocr_input:
                            image_content = pytesseract.image_to_string(ocr_input, lang='eng')
                    except Exception as image_error:
                        # One image that cannot be cropped or OCR'd must not
                        # discard everything extracted so far.
                        print(f"Skipping image {image_num} on page {page_num} of '{source_name}': {image_error}")
                        continue
                    metadata = {
                        "source": source_name,
                        "page": page_num,
                        "image_num": image_num,
                        "type": "image",
                        "image_path": image_path
                    }
                    # Prefix a header so OCR'd image chunks are distinguishable.
                    documents.append(Document(
                        page_content=f"Image {image_num} on page {page_num} contains:\n{image_content}",
                        metadata=metadata
                    ))
    except Exception as e:
        print(f"Error extracting details from {pdf_path}: {e}")
    return documents

In [6]:
raw_docs = extract_elements_from_pdf(pdf_path)
# extract_elements_from_pdf reports failures by printing instead of raising, so
# stop here with a clear message rather than letting the chunking cells fail
# later on an empty or None result.
if not raw_docs:
    raise RuntimeError(f"No content could be extracted from '{pdf_path}'")

Extracting text contents from 'Generative AI with LangChain (2024).pdf'
Extracting table contents from 'Generative AI with LangChain (2024).pdf'
Extracting images from 'Generative AI with LangChain (2024).pdf'


In [7]:
# Display the first extracted document to sanity-check its content and metadata.
raw_docs[0]

Document(metadata={'source': 'Generative AI with LangChain (2024).pdf', 'page': 2, 'type': 'text'}, page_content='Generative AI with LangChain\nBuild large language model (LLM) apps with Python,\nChatGPT, and other LLMs\nBen Auffarth\nBIRMINGHAM—MUMBAI')

In [8]:
# Embedding model shared by the semantic splitter, the upload step and query embedding.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# Split wherever the embedding distance between neighbouring sentences exceeds
# the 90th percentile.
text_splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_type='percentile',
    breakpoint_threshold_amount=90,
)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Only plain-text pages go through the semantic splitter.
chunks = text_splitter.split_documents(
    list(filter(lambda doc: doc.metadata['type'] == 'text', raw_docs))
)
# Table and image documents are appended unsplit, after all text chunks.
chunks += [doc for doc in raw_docs if doc.metadata['type'] != 'text']

In [10]:
# Total number of chunks (split text plus unsplit table/image documents).
len(chunks)

984

#### Embedding

In [11]:
def embed_docs(doc_chunks):
    """Embed document chunks and wrap them as Qdrant points ready for upsert.

    Args:
        doc_chunks: Iterable of documents exposing ``page_content`` and
            ``metadata``.

    Returns:
        A list of ``models.PointStruct``, one per chunk. The payload holds the
        chunk text under ``CONTENT_KEY`` plus every metadata entry. Point IDs
        are derived from the payload, so embedding the same chunk again yields
        the same ID and a repeated upsert overwrites the point instead of
        duplicating it. Chunks with identical text and metadata therefore
        collapse into a single point. Returns ``[]`` for empty input.
    """
    if not doc_chunks:
        return []

    chunk_texts = [chunk.page_content for chunk in doc_chunks]
    chunk_embeddings = embeddings.embed_documents(chunk_texts)

    upload_data = []
    for chunk, vector in zip(doc_chunks, chunk_embeddings):
        payload_for_qdrant = {
            CONTENT_KEY: chunk.page_content,
            **chunk.metadata
        }
        # Deterministic ID: a name-based UUID over the sorted payload entries.
        fingerprint = "\x1f".join(
            f"{key}={payload_for_qdrant[key]}" for key in sorted(payload_for_qdrant)
        )
        point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, fingerprint))

        upload_data.append(
            models.PointStruct(
                id=point_id,
                payload=payload_for_qdrant,
                vector=vector
            )
        )
    return upload_data



In [12]:
# Embed every chunk and build the Qdrant points that the upload cell sends.
data_to_upload = embed_docs(chunks)

In [None]:
# chunk_texts_for_embedding = [chunk.page_content for chunk in chunks]
# chunk_embeddings = embeddings.embed_documents(chunk_texts_for_embedding)
# #store the main text content within the Qdrant payload
# CONTENT_KEY_IN_PAYLOAD = "text_content_for_langchain"

# points_to_upsert = []
# for i, (chunk, vector) in enumerate(zip(chunks, chunk_embeddings)):
#     current_chunk_metadata = {}
#     for k, v_meta in chunk.metadata.items():
#         current_chunk_metadata[k] = v_meta

#     payload_for_qdrant = {
#         CONTENT_KEY_IN_PAYLOAD: chunk.page_content,
#         **current_chunk_metadata
#     }
#     # Generate a unique UUID for each point
#     unique_id = str(uuid.uuid4()) 

#     points_to_upsert.append(
#         models.PointStruct(
#             id=unique_id, 
#             payload=payload_for_qdrant,
#             vector=vector
#         )
#     )


In [156]:
# points_to_upsert[0].vector

In [157]:
# import pprint
# sample_payload = {}
# for k, v in points_to_upsert[0].payload.items():
#     if isinstance(v, str) and len(v) > 50:
#         sample_payload[k] = v[:50] + '...'
#     else:
#         sample_payload[k] = v

# pprint.pprint(sample_payload)

#### QDRANT Setup

In [13]:
class Qdrant:
    """Small convenience wrapper around the raw Qdrant client.

    Typical use: ``create_connection()``, then ``create_collections(...)`` for
    each collection, ``upsert_embeddings(points)`` to load data and
    ``search_vectors(...)`` to query.

    Attributes:
        url, api_key, timeout: Connection settings.
        qdrant_client: The connected client, or ``None`` before connecting.
        collections: Last raw response from ``get_collection()``.
        existing_collections: Collection names known after the last
            ``create_collections()`` call; default upsert targets.
    """

    # Index layouts create_collections() knows how to build.
    SUPPORTED_INDEX_TYPES = ("flat", "hnsw")

    def __init__(self, url=None, api_key=None, timeout=60):
        """Store connection settings.

        ``url`` / ``api_key`` fall back to the ``QDRANT_URL`` /
        ``QDRANT_API_KEY`` environment variables. They are read here, at call
        time, rather than in the signature, where they would be evaluated once
        when the class is defined.

        Raises:
            KeyError: If a setting is omitted and its environment variable is unset.
        """
        self.url = os.environ['QDRANT_URL'] if url is None else url
        self.api_key = os.environ['QDRANT_API_KEY'] if api_key is None else api_key
        self.timeout = timeout
        self.qdrant_client = None
        self.collections = None
        self.existing_collections = []

    def _client(self):
        """Return the connected client or raise if no connection was made yet."""
        client = getattr(self, "qdrant_client", None)
        if client is None:
            raise RuntimeError("Not connected to Qdrant; call create_connection() first.")
        return client

    def create_connection(self):
        """Create the Qdrant client, store it on the instance and return it.

        Raises:
            RuntimeError: If the client cannot be constructed.
        """
        try:
            self.qdrant_client = RawQdrantClient(
                url=self.url,
                api_key=self.api_key,
                timeout=self.timeout # Increased timeout for potentially long operations
            )
            print("Connection successful.")
            return self.qdrant_client
        except Exception as e:
            self.qdrant_client = None # Set to None if fails
            raise RuntimeError(f"Error connecting to Qdrant: {e}") from e

    # Backward-compatible alias: existing cells call the original misspelled name.
    create_connnection = create_connection

    def get_collection(self):
        """Fetch, store and return the server's collections listing.

        Raises:
            RuntimeError: If not connected or the request fails.
        """
        try:
            self.collections = self._client().get_collections()
            return self.collections
        except Exception as e:
            raise RuntimeError(f"Error listing Qdrant collections: {e}") from e

    def create_collections(self, collection_name, index_type="flat", vector_size=384):
        """Create ``collection_name`` unless it already exists.

        Args:
            collection_name: Name of the collection to create.
            index_type: "flat" (no explicit HNSW settings) or "hnsw".
            vector_size: Dimensionality of the stored vectors.

        Returns:
            The list of collection names on the server, including a newly
            created one.

        Raises:
            ValueError: If ``index_type`` is not supported.
            RuntimeError: If listing or creating collections fails.
        """
        if index_type not in self.SUPPORTED_INDEX_TYPES:
            raise ValueError(
                f"Unsupported index_type '{index_type}'; expected one of {self.SUPPORTED_INDEX_TYPES}"
            )
        try:
            collections_response = self.get_collection()
            self.existing_collections = [c.name for c in collections_response.collections]
            # Check the freshly fetched names, not an empty placeholder list.
            if collection_name in self.existing_collections:
                print(f"  Collection '{collection_name}' already exists. No action needed.")
                return self.existing_collections

            create_kwargs = {
                "collection_name": collection_name,
                "vectors_config": models.VectorParams(size=vector_size, distance=models.Distance.COSINE),
            }
            # NOTE(review): a collection created without hnsw_config presumably still
            # gets Qdrant's default HNSW settings rather than a true brute-force index.
            # Confirm against the Qdrant docs if "flat" is meant as an exact baseline.
            if index_type == "hnsw":
                create_kwargs["hnsw_config"] = {
                    "m": 16,  # number of connections per layer
                    "ef_construct": 100,  # size of the dynamic candidate list
                    "full_scan_threshold": 10000  # threshold for switching to brute force search
                }
            self._client().create_collection(**create_kwargs)
            self.existing_collections.append(collection_name)
            return self.existing_collections
        except Exception as e:
            raise RuntimeError(f"Error creating Qdrant collection '{collection_name}': {e}") from e

    def upsert_embeddings(self, embeddings_list, collection_names=None, batch_size=100):
        """Upsert points in batches and print the collection size after each batch.

        Args:
            embeddings_list: Sequence of points to upload.
            collection_names: Collections to write to. Defaults to
                ``self.existing_collections``, i.e. every name recorded by the
                last ``create_collections()`` call.
            batch_size: Number of points per upsert request.

        A failing batch is reported and skipped; the remaining batches still run.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
            RuntimeError: If not connected.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        client = self._client()
        targets = self.existing_collections if collection_names is None else collection_names
        for collection_name in targets:
            for start in range(0, len(embeddings_list), batch_size):
                batch_of_points = embeddings_list[start : start + batch_size]
                try:
                    client.upsert(
                        collection_name=collection_name,
                        points=batch_of_points,
                        wait=True # Wait for the batch to complete
                    )
                    count_result = client.count(collection_name=collection_name, exact=True)
                    # Print the integer count, not the repr of the result object.
                    print(f"{count_result.count} points are currently present in collection {collection_name}")
                except Exception as e:
                    print(f"Error upserting data into '{collection_name}': {e}")
                    continue

    def search_vectors(self, collection_name, query_vector, top_results):
        """Run a nearest-neighbour search and return the hits as Documents.

        Args:
            collection_name: Collection to search.
            query_vector: Embedded query.
            top_results: Maximum number of hits to return.

        Returns:
            A list of ``Document`` objects built from hits whose payload
            contains ``CONTENT_KEY``. The remaining payload entries plus the
            similarity ``score`` go into the metadata.

        Raises:
            RuntimeError: If not connected.
        """
        client = self._client()
        if hasattr(client, "query_points"):
            # `search` is deprecated in current qdrant-client releases;
            # query_points is its replacement and wraps the hits in `.points`.
            response = client.query_points(
                collection_name=collection_name,
                query=query_vector,
                limit=top_results,
                with_payload=True,
                with_vectors=False
            )
            search_result_points = response.points
        else:
            # Older clients without query_points.
            search_result_points = client.search(
                collection_name=collection_name,
                query_vector=query_vector,
                limit=top_results,
                with_payload=True,
                with_vectors=False
            )

        manually_created_docs = []
        for point in search_result_points or []:
            payload = point.payload
            if payload and CONTENT_KEY in payload:
                doc_metadata = {k: v for k, v in payload.items() if k != CONTENT_KEY}
                doc_metadata["score"] = point.score
                manually_created_docs.append(
                    Document(page_content=payload[CONTENT_KEY], metadata=doc_metadata)
                )
        return manually_created_docs
                

        


In [None]:
# qdrant_client = RawQdrantClient(
#                 url=os.environ['QDRANT_URL'],
#                 api_key=os.environ['QDRANT_API_KEY'],
#                 timeout=60 # Increased timeout for potentially long operations
#             )
# qdrant_client.delete_collection(collection_name="hnsw_collection")
# qdrant_client.delete_collection(collection_name="flat_collection")

True

In [14]:
qdrant = Qdrant()
qdrant.create_connnection()
qdrant.create_collections(FLAT_COLLECTION_NAME, index_type="flat")
# index_type must be passed explicitly: it defaults to "flat", which would give
# this collection exactly the same configuration as the one above and make the
# flat-vs-HNSW comparison meaningless.
# If 'hnsw_collection' already exists on the server from an earlier run, delete
# it first so the HNSW settings actually apply.
qdrant.create_collections(HNSW_COLLECTION_NAME, index_type="hnsw")

Connection successful.


['flat_collection', 'hnsw_collection']

In [15]:
# Upload the embedded points; upsert_embeddings writes them, in batches, to
# every collection name recorded on the wrapper.
qdrant.upsert_embeddings(data_to_upload)

count=100 is currently present in collection flat_collection
count=200 is currently present in collection flat_collection
count=300 is currently present in collection flat_collection
count=400 is currently present in collection flat_collection
count=500 is currently present in collection flat_collection
count=600 is currently present in collection flat_collection
count=700 is currently present in collection flat_collection
count=800 is currently present in collection flat_collection
count=900 is currently present in collection flat_collection
count=984 is currently present in collection flat_collection
count=100 is currently present in collection hnsw_collection
count=200 is currently present in collection hnsw_collection
count=300 is currently present in collection hnsw_collection
count=400 is currently present in collection hnsw_collection
count=500 is currently present in collection hnsw_collection
count=600 is currently present in collection hnsw_collection
count=700 is currently p

In [16]:
# qdrant_client.create_collection(
#     collection_name=HNSW_COLLECTION_NAME,
#     vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
#     hnsw_config={
#             "m": 16,  # number of connections per layer
#             "ef_construct": 100,  # size of the dynamic candidate list
#             "full_scan_threshold": 10000  # threshold for switching to brute force search
#         }
# )

In [17]:
# qdrant_client.create_collection(
#     collection_name=FLAT_COLLECTION_NAME,
#     vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE)
# )

In [18]:
# collections_response = qdrant_client.get_collections()
# collections_response

### Retriever Function

In [19]:
import time
def retrieve_documents(query, collection_name, embeddings_model, index_type, top_results):
    """Embed ``query`` and fetch its nearest documents from a Qdrant collection.

    Args:
        query: Natural-language question.
        collection_name: Collection to search.
        embeddings_model: Object exposing ``embed_query(text)``.
        index_type: Label of the index being queried. Kept for caller
            compatibility; it does not influence the search.
        top_results: Maximum number of documents to return.

    Returns:
        ``(retrieved_docs, duration)`` where ``duration`` is the time in
        seconds spent in the vector search only (query embedding excluded).
    """
    print('Retrieving Documents initially from the Vector Database.....')
    query_vector = embeddings_model.embed_query(query)
    # perf_counter is monotonic and high-resolution, which is what interval
    # timing needs; time.time() can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    retrieved_docs = qdrant.search_vectors(collection_name, query_vector, top_results)
    duration = time.perf_counter() - start_time
    print(f"  Retrieved {len(retrieved_docs)} documents in {duration:.4f} seconds.")
    return retrieved_docs, duration

    

In [151]:
# query = "How to build an application for customer service?"

# retrieved_docs_flat, retrieval_time_flat = retrieve_documents(query, 'flat_collection', embeddings, 'flat', 3)
# retrieved_docs_hnsw, retrieval_time_hnsw = retrieve_documents(query, 'hnsw_collection', embeddings, 'hnsw', 3)

In [150]:
# retrieved_docs_flat

In [5]:
# retrieved_docs_hnsw

In [6]:
# [doc.page_content for doc in retrieved_docs_hnsw]

### Re-Ranking

In [27]:
# Loaded CrossEncoder instances keyed by model name, so repeated queries reuse
# the weights instead of reloading the model on every call.
_RERANKER_CACHE = {}


def _get_reranker(model_name):
    """Return the cached CrossEncoder for ``model_name``, loading it on first use."""
    if model_name not in _RERANKER_CACHE:
        _RERANKER_CACHE[model_name] = CrossEncoder(model_name)
    return _RERANKER_CACHE[model_name]


def re_rank_retrieved_docs(query, collection, index_type, top_reranked_retrievals=5, initial_top_k=10):
    """Retrieve candidates from Qdrant and re-order them with a cross-encoder.

    Args:
        query: Natural-language question.
        collection: Qdrant collection to search.
        index_type: Index label forwarded to ``retrieve_documents``.
        top_reranked_retrievals: Number of re-ranked documents to return.
        initial_top_k: Size of the candidate pool fetched before re-ranking.
            Raised to ``top_reranked_retrievals`` when smaller, so the
            requested number of results can actually be returned.

    Returns:
        The best ``top_reranked_retrievals`` documents, highest cross-encoder
        score first. An empty list when nothing was retrieved.
    """
    print('Re Ranking the Retrieved Documents using CrossEncoder')
    docs_without_reranking, _retrieval_time = retrieve_documents(
        query=query,
        collection_name=collection,
        embeddings_model=embeddings,
        index_type=index_type,
        top_results=max(initial_top_k, top_reranked_retrievals)
    )
    if not docs_without_reranking:
        # Return an empty list rather than an implicit None so callers can
        # iterate over the result unconditionally.
        return []

    reranker_model = _get_reranker('cross-encoder/ms-marco-MiniLM-L-6-v2')
    sentence_pairs = [[query, doc.page_content] for doc in docs_without_reranking]
    rerank_scores = reranker_model.predict(sentence_pairs)

    # Pair each document with its score and sort best-first.
    reranked_docs_with_scores = sorted(
        zip(docs_without_reranking, rerank_scores),
        key=lambda item: item[1],
        reverse=True
    )
    top_docs_with_scores = reranked_docs_with_scores[:top_reranked_retrievals]

    print(f"Reranked Documents (Top {top_reranked_retrievals}):")
    for i, (doc, score) in enumerate(top_docs_with_scores):
        # .get keeps the summary from failing on documents lacking a field.
        vector_score = doc.metadata.get('score', float('nan'))
        print(f"{i+1}. Rerank Score: {score:.4f}, Vector Score: {vector_score:.4f}, Page: {doc.metadata.get('page')}, Source: {doc.metadata.get('source')}")

    return [doc for doc, _score in top_docs_with_scores]

In [7]:
# reranked_output = re_rank_retrieved_docs(query, 'hnsw_collection', 'hnsw')
# print("^^^^^^^^^^^^^^^^^^^^^^^^^^^")
# print(reranked_output)
# print("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")
# # print([doc[0].page_content for doc in reranked_output])

### RAG CHAIN

In [24]:
# Prompt layout for the RAG answer step; {query} and {context} are filled in
# by create_prompt.
_RAG_PROMPT_TEMPLATE = """
    You are an expert AI assistant. Answer the following question based on the provided context.

    Question: {query}

    Context:
    {context}

    Instructions:
    1. Provide a clear, concise answer.
    2. If the context doesn't contain the answer, say "I could not find an answer."

    Answer:
    """


def create_prompt(query, reranked_docs, max_chars_per_doc=500):
    """Build the final LLM prompt from the question and the re-ranked documents.

    Args:
        query: The user's question.
        reranked_docs: Documents (exposing ``page_content``) in ranking order.
            ``None`` or an empty list produces a prompt with an empty context.
        max_chars_per_doc: Maximum characters taken from each document;
            ``None`` disables truncation.

    Returns:
        The formatted prompt string. A banner line is printed as a side effect.
    """
    prompt_template = PromptTemplate(
        input_variables=["query", "context"],
        template=_RAG_PROMPT_TEMPLATE
    )

    sections = []
    for position, doc in enumerate(reranked_docs or [], start=1):
        content = doc.page_content
        # Only mark a document with "..." when text was actually cut off, so the
        # model is not told that complete passages are partial.
        if max_chars_per_doc is not None and len(content) > max_chars_per_doc:
            content = content[:max_chars_per_doc] + "..."
        sections.append(f"Document {position}:\n{content}")
    documents_str = "\n\n".join(sections)

    prompt = prompt_template.format(
        query=query,
        context=documents_str
    )
    print("===== GENERATED PROMPT =====")
    return prompt

    


In [8]:
# import pprint
# pprint.pprint(create_prompt("What is Fake LLM?",reranked_output))

In [30]:
def run_rag_chain(query, collection, index_type, temperature=0.3):
    """Answer ``query`` with retrieval, re-ranking and a Groq-hosted LLM.

    Args:
        query: The user's question.
        collection: Qdrant collection to retrieve from.
        index_type: Index label forwarded to the retrieval step.
        temperature: Sampling temperature for the LLM.

    Returns:
        The text content of the LLM response.
    """
    # 1. Document retrieval and re-ranking. Normalise a missing result to an
    #    empty list so prompt construction never receives None.
    reranked_retrieved_docs = re_rank_retrieved_docs(
        query=query,
        collection=collection,
        index_type=index_type,
        top_reranked_retrievals=5
    ) or []

    # 2. Prompt construction.
    prompt = create_prompt(query, reranked_retrieved_docs)

    # 3. Generation.
    llm = ChatGroq(model=INFERENCE_MODEL, temperature=temperature)
    llm_response_obj = llm.invoke(prompt)
    return llm_response_obj.content

In [31]:
# End-to-end demo: answer one question against the HNSW-backed collection.
run_rag_chain(
    query="What is Fake LLM?",
    collection='hnsw_collection',
    index_type='hnsw',
)

Re Ranking the Retrieved Documents using CrossEncoder
Retrieving Documents initially from the Vector Database.....


  search_result_points = self.qdrant_client.search(


  Retrieved 10 documents in 0.8419 seconds.
Reranked Documents (Top 5):
1. Rerank Score: 5.7613, Vector Score: 0.5781, Page: 95, Source: Generative AI with LangChain (2024).pdf
2. Rerank Score: -1.0813, Vector Score: 0.4570, Page: 123, Source: Generative AI with LangChain (2024).pdf
3. Rerank Score: -1.7648, Vector Score: 0.4845, Page: 66, Source: Generative AI with LangChain (2024).pdf
4. Rerank Score: -2.1071, Vector Score: 0.4860, Page: 62, Source: Generative AI with LangChain (2024).pdf
5. Rerank Score: -2.6208, Vector Score: 0.4486, Page: 150, Source: Generative AI with LangChain (2024).pdf
===== GENERATED PROMPT =====


"Fake LLM is a tool from the langchain library that allows you to simulate the behavior of a real Large Language Model (LLM) without actually using one.  It's useful for testing and prototyping because it avoids rate limits and lets you specify predefined responses. \n"