# Pip Installs

In [31]:
# pip install tiktoken

In [32]:
# pip install pinecone

In [33]:
# pip install pymupdf

In [34]:
# pip install langchain_pinecone

In [35]:
# pip install langchain_openai

# Code

In [36]:
import fitz
import os
import tiktoken
from dotenv import load_dotenv
import openai
from pinecone import Pinecone, ServerlessSpec

import langchain
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain.chains import RetrievalQA

## Testing the PDF loader

In [37]:
class PDFLoader:
    """Reads the plain text of a PDF file using PyMuPDF (``fitz``)."""

    def __init__(self, pdf_path):
        """Remember which file to read.

        Args:
            pdf_path: Path to the PDF; it is passed unchanged to ``fitz.open``.
        """
        self.pdf_path = pdf_path

    def extract_text(self):
        """Return the text of every page, concatenated in page order.

        Returns:
            str: The ``get_text()`` output of each page joined with no
            separator (an empty string when the document has no pages).
        """
        # The context manager closes the document (and its file handle) even
        # when a page fails to parse; the original never closed it.
        with fitz.open(self.pdf_path) as doc:
            # join() avoids rebuilding the growing string once per page.
            return "".join(page.get_text() for page in doc)
    
# Demo: extract the text of the sample PDF and print it.
# `text` is left as a notebook-level variable; the embedding cell further down
# passes it to EmbeddingGenerator.process_text.
if __name__ == '__main__':
    loader = PDFLoader("List_of_countries_by_air_pollution.pdf")
    text = loader.extract_text()
    # Prints the whole document, so the cell output can be long.
    print(text)
    

List of countries by air pollution
The following list of countries by air pollution sorts the countries of the world according to their average
measured concentration of particulate matter (PM2.5) in micrograms per cubic meter (μg/m3). The World
Health Organization's recommended limit is 5 micrograms per cubic meter, although there are also
various national guideline values, which are often much higher. Air pollution is among the biggest health
problems of modern industrial society and is responsible for more than 10 percent of all deaths worldwide
(nearly 4.5 million premature deaths in 2019), according to The Lancet. Air pollution can affect nearly
every organ and system of the body, negatively affecting nature and humans alike. Air pollution is a
particularly big problem in emerging and developing countries, where global environmental standards
often cannot be met. The data in this list refers only to outdoor air quality and not indoor air quality,
which caused an additional two mil

In [38]:
# Load variables from a local .env file (if one exists) into the process
# environment so the os.getenv(...) lookups below can find the API keys.
load_dotenv()

class EmbeddingGenerator:
    """Splits text into token-sized chunks and embeds each chunk with OpenAI."""

    def __init__(self):
        """Build an OpenAI client from the ``OPENAI_API_KEY`` environment variable.

        Raises:
            ValueError: If ``OPENAI_API_KEY`` is unset or empty.
        """
        openai_api_key = os.getenv("OPENAI_API_KEY")
        if not openai_api_key:
            raise ValueError("OPENAI_API_KEY not set")

        # Keep an explicit client so the key validated above is the one that is
        # actually used, instead of relying on the module-level default client.
        self.client = openai.OpenAI(api_key=openai_api_key)

    # split into chunks by tokens
    def chunk_text_by_tokens(self, text, chunk_size, encoding_name="cl100k_base"):
        """Split ``text`` into consecutive pieces of at most ``chunk_size`` tokens.

        Args:
            text: The string to split.
            chunk_size: Maximum number of tokens per chunk; must be positive.
            encoding_name: Name of the tiktoken encoding used to tokenize.

        Returns:
            list[str]: Decoded chunks in document order (empty for empty text).

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        # A negative step would make range() empty and silently drop all text;
        # zero would raise a cryptic range() error. Fail clearly instead.
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")

        encoding = tiktoken.get_encoding(encoding_name)
        tokens = encoding.encode(text)
        return [
            encoding.decode(tokens[start:start + chunk_size])
            for start in range(0, len(tokens), chunk_size)
        ]

    # generate embeddings for each chunk and return list of embeddings
    def generate_embeddings(self, chunks):
        """Embed every chunk with ``text-embedding-ada-002``.

        One request is sent per chunk.

        Args:
            chunks: Iterable of strings to embed.

        Returns:
            list: One embedding vector per chunk, in the same order.
        """
        embeddings = []
        for chunk in chunks:
            response = self.client.embeddings.create(
                input=chunk,
                model="text-embedding-ada-002"
            )
            embeddings.append(response.data[0].embedding)
        return embeddings

    # split text into chunks and generate embeddings
    def process_text(self, text, chunk_size=1000):
        """Chunk ``text`` and embed every chunk.

        Args:
            text: The string to process.
            chunk_size: Maximum number of tokens per chunk; must be positive.

        Returns:
            tuple: ``(chunks, embeddings)`` as two parallel lists.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        chunks = self.chunk_text_by_tokens(text, chunk_size)
        embeddings = self.generate_embeddings(chunks)
        return chunks, embeddings

# Demo: chunk the extracted PDF text into 800-token pieces and embed each one.
# Relies on `text` from the PDF loader cell above; `chunks` and `embeddings`
# are notebook-level variables that the Pinecone upload cell reads afterwards.
if __name__ == "__main__":
    generator = EmbeddingGenerator()
    chunks, embeddings = generator.process_text(text, chunk_size=800)

In [44]:
class PineconeStore:
    """Creates the notebook's Pinecone index when missing and writes vectors to it."""

    def __init__(self, environmeent="us-east-1"):
        """Connect to Pinecone and make sure the target index exists.

        Args:
            environmeent: AWS region used for the serverless index when it has
                to be created. The misspelled name is kept so existing keyword
                callers keep working; the default matches the region that was
                previously hard-coded.

        Raises:
            ValueError: If ``PINECONE_API_KEY`` is unset or empty.
        """
        pinecone_api_key = os.getenv("PINECONE_API_KEY")
        if not pinecone_api_key:
            raise ValueError("PINECONE_API_KEY not set")

        # pinecone instance (the keyword was previously misspelled "ap_key",
        # so the validated key was never handed to the client)
        self.pc = Pinecone(api_key=pinecone_api_key)
        self.index_name = "pdf-vector-store-pollution"

        if self.index_name not in self.pc.list_indexes().names():
            self.pc.create_index(
                name=self.index_name,
                # Matches the 1536-dimensional vectors of text-embedding-ada-002.
                dimension=1536,
                metric='cosine',
                spec=ServerlessSpec(cloud='aws', region=environmeent),
            )

    def save_vectors(self, vectors, metadata, chunks):
        """Upsert one record per vector, each with its own metadata.

        Record ids have the form ``"<metadata['id']>_chunk_<i>"``.

        Args:
            vectors: Sequence of embedding vectors.
            metadata: Mapping that provides the ``"id"`` and ``"source"`` keys.
            chunks: Sequence of chunk texts; ``chunks[i]`` is stored with
                ``vectors[i]``.

        Raises:
            ValueError: If there are fewer chunks than vectors.
            KeyError: If ``metadata`` lacks ``"id"`` or ``"source"``.
        """
        # Check before writing anything so a mismatch cannot leave the index
        # half-populated (previously this surfaced as an IndexError mid-loop).
        if len(chunks) < len(vectors):
            raise ValueError(
                f"got {len(vectors)} vectors but only {len(chunks)} chunks"
            )

        index = self.pc.Index(self.index_name)

        # save each embedding with unique metadata
        for i, vector in enumerate(vectors):
            vector_id = f"{metadata['id']}_chunk_{i}"  # unique id
            chunk_metadata = {
                "id": vector_id,
                "source": metadata["source"],
                "chunk": i,
                "text": chunks[i]
            }

            index.upsert(vectors=[(vector_id, vector, chunk_metadata)])

# Demo: upload the embeddings and their chunk texts produced by the embedding
# cell above (`embeddings`, `chunks`) to the Pinecone index.
if __name__ == "__main__":
    vector_store = PineconeStore()
    # NOTE(review): "source" is recorded as "example.pdf" although the text was
    # extracted from List_of_countries_by_air_pollution.pdf — confirm intended.
    vector_store.save_vectors(embeddings, {"id": "doc_1", "source": "example.pdf"}, chunks)


In [45]:
class PineconeRetriever:
    """Answers questions with a RetrievalQA chain over the Pinecone index."""

    def __init__(self, pinecone_api_key, openai_api_key):
        """Wire up Pinecone, the OpenAI models and the QA chain.

        Args:
            pinecone_api_key: API key passed to the Pinecone client.
            openai_api_key: API key passed to both the embedding model and
                the completion model.
        """
        # Pinecone side: client plus a handle on the index.
        self.index_name = "pdf-vector-store-pollution"
        self.pc = Pinecone(api_key=pinecone_api_key)
        self.index = self.pc.Index(self.index_name)

        # OpenAI side: query embeddings and the answering LLM.
        self.embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)
        self.llm = OpenAI(temperature=0, api_key=openai_api_key)

        # Vector store over the index; document text is read from the "text"
        # metadata field.
        self.vector_store = PineconeVectorStore(
            index=self.index,
            embedding=self.embedding_model,
            text_key="text",
        )
        self.retriever = self.vector_store.as_retriever()

        # "stuff"-type RetrievalQA chain combining the retriever and the LLM.
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.retriever,
        )

    def query(self, query_text):
        """Run the QA chain on ``query_text`` and return its ``'result'`` entry."""
        answer = self.qa_chain.invoke({"query": query_text})
        return answer['result']


In [47]:
# Demo: ask a question against the Pinecone index and print the answer.
if __name__ == '__main__':
    # API keys are read from the environment (PINECONE_API_KEY / OPENAI_API_KEY);
    # nothing needs to be pasted into this cell.
    pinecone_api_key = os.getenv("PINECONE_API_KEY")
    openai_api_key = os.getenv("OPENAI_API_KEY")

    retriever = PineconeRetriever(pinecone_api_key=pinecone_api_key, openai_api_key=openai_api_key)
    result = retriever.query("Tell me about the air pollution in India and its nearby countries.")
    print(result)

 According to the 2022 list of countries by air pollution, India ranks second with an average measured concentration of particulate matter (PM2.5) at 41.39 μg/m3. This is significantly higher than the World Health Organization's recommended limit of 5 μg/m3. India's neighboring countries, Bangladesh and Pakistan, also rank high on the list at first and fifth place respectively. This indicates that air pollution is a major issue in this region, with industrialization and lack of environmental regulations contributing to the problem. The data also shows that air pollution is a major health problem in emerging and developing countries, with millions of premature deaths attributed to it.
