In [None]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq

In [2]:
load_dotenv()

True

# Load the PDF Files

In [3]:
# !mkdir pdfs

In [None]:
# !gdown "https://drive.google.com/uc?id=1hPQlXrX8FbaYaLypxTmeVOFNitbBMlEE" -O pdfs/yolov7paper.pdf
# !gdown "https://drive.google.com/uc?id=1vILwiv6nS2wI3chxNabMgry3qnV67TxM" -O pdfs/rachelgreecv.pdf

# Extract the Text from the PDF's

In [5]:
loader = PyPDFDirectoryLoader("pdfs")
data = loader.load()

# Split the Extracted Data into Text Chunks

In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
text_chunks = text_splitter.split_documents(data)

# Embeddings

In [7]:
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L6-v2")
results = hf_embeddings.embed_query("What is the YOLOv7 paper about?")
print(results)

  hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L6-v2")
2025-03-24 19:48:33.997446: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742824114.022978   22425 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742824114.029431   22425 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742824114.049996   22425 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742824114.050035   22425 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target mor

[-0.5099515318870544, 0.3396693468093872, 0.018211059272289276, -0.3898174464702606, -0.05946885421872139, -0.12047633528709412, 0.07061275094747543, 0.04617594555020332, 0.4419732987880707, 0.029263949021697044, -0.005520263686776161, 0.15431848168373108, 0.22259627282619476, -0.08958163857460022, -0.08633971959352493, -0.29083117842674255, 0.011318222619593143, -0.1661413311958313, -0.40186846256256104, 0.14409500360488892, 0.09455043822526932, 0.22527383267879486, 0.633211076259613, -0.08051353693008423, 0.0694948360323906, -0.2596285343170166, 0.3071894347667694, 0.29075026512145996, -0.10931214690208435, 0.14082826673984528, -0.1997317373752594, 0.43527019023895264, -0.011179097928106785, -0.04854302108287811, -0.5683408379554749, 0.6682410836219788, 0.5142979025840759, -0.04513942077755928, -0.2503037750720978, 0.265929251909256, 0.03157723322510719, 0.1909891664981842, 0.17394207417964935, 0.12888038158416748, 0.48200905323028564, -0.026408813893795013, -0.22784410417079926, -0.

In [8]:
len(results)

384

In [9]:
load_dotenv()
PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY')
GROQ_API_KEY=os.environ.get('GROQ_API_KEY')
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

In [10]:
load_dotenv()
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY')
pc = Pinecone(api_key=PINECONE_API_KEY) # One time run

index_name = "test-index"

pc.create_index(
    name=index_name,
    dimension=384, 
    metric="cosine", 
    spec=ServerlessSpec(
        cloud="aws", 
        region="us-east-1"
    ) 
)

In [11]:
# Embed each chunk and upsert the embeddings into Pinecone index. Just one time run it
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=hf_embeddings, 
)

In [12]:
# Load Existing index 
from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=hf_embeddings
)

In [13]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [14]:
retrieved_docs = retriever.invoke("What is Acetaminophen?")

In [15]:
retrieved_docs

[Document(id='d942c7f6-c061-4888-8567-4c792e5e495d', metadata={'page': 1.0, 'source': 'pdfs/yolov7paper.pdf'}, page_content='label assignment.\nThe contributions of this paper are summarized as fol-\nlows: (1) we design several trainable bag-of-freebies meth-\nods, so that real-time object detection can greatly improve\nthe detection accuracy without increasing the inference\ncost; (2) for the evolution of object detection methods, we\nfound two new issues, namely how re-parameterized mod-\nule replaces original module, and how dynamic label as-\nsignment strategy deals with assignment to different output'),
 Document(id='010d9a17-33b6-4d1a-a283-23c140058e5d', metadata={'page': 12.0, 'source': 'pdfs/yolov7paper.pdf'}, page_content='2018. 2\n[34] Paul F Jaeger, Simon AA Kohl, Sebastian Bickel-\nhaupt, Fabian Isensee, Tristan Anselm Kuder, Heinz-Peter\nSchlemmer, and Klaus H Maier-Hein. Retina U-Net: Em-\nbarrassingly simple exploitation of segmentation supervi-\nsion for medical object 

In [18]:
# Similarity search with scores
query = "What is Object Detection?"
results = docsearch.similarity_search_with_score(query, k=2)  # Retrieve top 2 most similar documents with scores

for i, (doc, score) in enumerate(results):
    print(f"Result {i+1}: {doc.page_content}\n Metadata: {doc.metadata}\n Similarity Score: {score}\n")

Result 1: formers for end-to-end object detection. In Proceedings of
the International Conference on Learning Representations
(ICLR), 2021. 10
15
 Metadata: {'page': 14.0, 'source': 'pdfs/yolov7paper.pdf'}
 Similarity Score: 0.656794906

Result 2: DETR, DINO-5scale-R50, ViT-Adapter-B and many other
object detectors in speed and accuracy. Moreover, we train
YOLOv7 only on MS COCO dataset from scratch without
using any other datasets or pre-trained weights. Source
code is released in https://github.com/WongKinYiu/yolov7.
1. Introduction
Real-time object detection is a very important topic in
computer vision, as it is often a necessary component in
computer vision systems. For example, multi-object track-
 Metadata: {'page': 0.0, 'source': 'pdfs/yolov7paper.pdf'}
 Similarity Score: 0.538994908



In [19]:
groq_llm = ChatGroq(api_key=GROQ_API_KEY, model_name="llama3-8b-8192", temperature=0.5)

In [20]:
from langchain.chains import RetrievalQA

retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3}) # Use Pinecone as a retriever

qa_chain = RetrievalQA.from_chain_type( # Create a QA chain
    llm=groq_llm,
    retriever=retriever
)

response = qa_chain.run("How does Object Detection work?")
print(response)

  response = qa_chain.run("How does Object Detection work?")


Object detection is a computer vision technique that involves identifying and locating objects within an image or video. Here's a general overview of how it works:

1. **Image Preprocessing**: The input image is preprocessed to enhance its quality and prepare it for object detection. This may include resizing, normalizing, and converting the image to a suitable format.
2. **Feature Extraction**: A deep neural network is used to extract features from the input image. These features can be spatial (e.g., edges, corners) or semantic (e.g., object parts, textures).
3. **Object Proposal Generation**: The network generates a set of object proposals, which are regions of the image that are likely to contain objects. These proposals are typically generated using algorithms such as region proposal networks (RPNs) or selective search.
4. **Feature Integration**: The features extracted in step 2 are integrated with the object proposals to form a feature representation for each proposal. This is t