In [None]:
!pip install PyPDF2 python-dotenv pinecone  google-generativeai sentence-transformers

In [3]:
from PyPDF2 import PdfReader

In [4]:
import PyPDF2
import os
def read_file(file_path: str):
    """Read content from a PDF file"""
    text = ""
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            text += page.extract_text() + "\n"
    return text

In [5]:
def read_document(file_path: str):
    """Read document content based on file extension"""
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()
    if file_extension == '.pdf':
        return read_file(file_path)
    else:
        raise ValueError(f"Unsupported file format: {file_extension}")

In [6]:
def split_text(text: str, chunk_size: int = 500):
    """Split text into chunks while preserving sentence boundaries"""
    sentences = text.replace('\n', ' ').split('. ')
    chunks = []
    current_chunk = []
    current_size = 0

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        # Ensure proper sentence ending
        if not sentence.endswith('.'):
            sentence += '.'

        sentence_size = len(sentence)

        # Check if adding this sentence would exceed chunk size
        if current_size + sentence_size > chunk_size and current_chunk:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_size = sentence_size
        else:
            current_chunk.append(sentence)
            current_size += sentence_size

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [73]:
from google.colab import userdata
userdata.get('groq_api_key')

## Load the GROQ and GOOGLE API keys
groq_api_key = userdata.get('groq_api_key')
google_api_key = userdata.get('GOOGLE_API_KEY')
pine_api_key = userdata.get('pine_api_key')

In [10]:
import os
import pinecone
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone,ServerlessSpec

pc = Pinecone(api_key=pine_api_key)

# Create or connect to an existing Pinecone index
index_name = "testindex"
# Get a fresh list of indexes.
indexes = pc.list_indexes()

# Convert the IndexList object to a list of index names for comparison
index_names = [index.name for index in indexes]

# Check if the index name exists in the list of index names
if index_name not in index_names:
    pc.create_index(name=index_name, dimension=384,spec=ServerlessSpec(
    cloud="aws",
    region="us-east-1"
  ),)  # Dimension depends on embedding model used
else:
    print(f"Index '{index_name}' already exists.")

index = pc.Index(index_name)

# Configure Sentence Transformer embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")

Index 'testindex' already exists.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [26]:
def process_document(file_path: str):
    """Process a single document and prepare it for Pinecone"""
    try:
        # Read the document
        content = read_document(file_path)

        # Split into chunks
        chunks = split_text(content)

        # Prepare metadata
        file_name = os.path.basename(file_path)
        # Include the 'text' key in the metadata
        metadatas = [{"source": file_name, "chunk": i, "text": chunk} for i, chunk in enumerate(chunks)]
        ids = [f"{file_name}_chunk_{i}" for i in range(len(chunks))]

        return ids, chunks, metadatas
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return [], [], []

In [27]:
def add_to_pinecone(index, ids, texts, metadatas):
    """Add documents to Pinecone in batches"""
    if not texts:
        return

    batch_size = 100
    for i in range(0, len(texts), batch_size):
        end_idx = min(i + batch_size, len(texts))

        # Generate embeddings for the batch
        embeddings = model.encode(texts[i:end_idx]).tolist()

        # Prepare data for upsertion
        # Ensure that j stays within the bounds of all three lists
        pinecone_data = [
            (ids[i + k], embeddings[k], metadatas[i + k])
            for k in range(end_idx - i)  # Iterate using k within the current batch
        ]
        index.upsert(vectors=pinecone_data)

In [28]:
def process_and_add_documents_to_pinecone(index, file_path: str):
    """Process a single document and add to Pinecone index"""
    print(f"Processing {os.path.basename(file_path)}...")
    ids, texts, metadatas = process_document(file_path)
    add_to_pinecone(index, ids, texts, metadatas)
    print(f"Added {len(texts)} chunks to Pinecone index")

In [29]:
folder_path = "/content/budget_speech.pdf"
process_and_add_documents_to_pinecone(index, folder_path)

Processing budget_speech.pdf...
Added 209 chunks to Pinecone index


In [32]:
def semantic_search(index, query: str, n_results: int = 2):
    """Perform semantic search on the Pinecone index"""
    # Generate embedding for the query
    query_embedding = model.encode([query]).tolist()[0]

    # Perform the search
    results = index.query(
        vector=query_embedding,
        top_k=n_results,
        include_metadata=True
    )
    return results


In [58]:
def get_context_with_sources(results):
    """Extract context and source information from Pinecone search results"""
    # Combine document chunks into a single context
    # Accessing the text using the correct key from metadata
    context = "\n\n".join([match['metadata']['text'] for match in results['matches']])

    # Format sources with metadata
    sources = [
        f"{match['metadata']['source']} (chunk {match['metadata']['chunk']})"
        for match in results['matches']
    ]

    return context, sources

In [59]:
def print_search_results(results):
    """Print formatted Pinecone search results"""
    print("\nSearch Results:\n" + "-" * 50)

    for i, match in enumerate(results['matches']):
        # Accessing the text using the correct key from metadata
        doc = match['metadata']['text']
        meta = match['metadata']
        score = match['score']

        print(f"\nResult {i + 1}")
        print(f"Source: {meta['source']}, Chunk {meta['chunk']}")
        print(f"Score: {score}")
        print(f"Content: {doc}\n")

In [60]:
query = "How much the agriculture target will be increased by how many crore?"
results = semantic_search(index, query)

# Print formatted results
print_search_results(results)

# Extract context and sources
context, sources = get_context_with_sources(results)
print("\nContext:\n", context)
print("\nSources:\n", "\n".join(sources))


Search Results:
--------------------------------------------------

Result 1
Source: budget_speech.pdf, Chunk 73.0
Score: 0.555776358
Content: For collection of bio-mass and  distribution of bio-manure, appropriate fiscal support will be provided. 19       Bhartiya Prakritik Kheti Bio-Input Resource Centres     84. Over the next 3 years, we will facilitate 1 crore farmers to adopt  natural farming. For this, 10,000 Bio-Input Resource Centres will be set-up,  creating a national-level distributed micro-fertilizer and pesticide  manufacturing network. MISHTI  85.


Result 2
Source: budget_speech.pdf, Chunk 27.0
Score: 0.550094903
Content: Now to make India a global hub for ' Shree Anna' , the Indian Institute  of Millet Research, Hyderabad  will be supported as the Centre of Excellence  for sharing best practices, research and technologies at the international  level. Agriculture Credit   23. The agriculture credit target will be increased   to ` 20 lakh crore with focus on animal husba

In [61]:
!pip install groq



In [62]:
from groq import Groq

client = Groq(api_key=groq_api_key)

In [63]:
def get_prompt(context: str, query: str):
    """Generate a prompt combining context, history, and query"""
    prompt = f"""Based on the following context and conversation history,
    please provide a relevant and contextual response. If the answer cannot
    be derived from the context, only use the conversation history or say
    "I cannot answer this based on the provided information."

    Context from documents:
    {context}

    Human: {query}

    Assistant:"""

    return prompt


In [64]:
def generate_response(query: str, context: str, conversation_history: str = ""):
    """Generate a response using OpenAI with conversation history"""
    prompt = get_prompt(context, conversation_history, query)

    try:
        response = client.chat.completions.create(
            model="gpt-4",  # or gpt-3.5-turbo for lower cost
            messages=[
                {"role": "system", "content": "You are a helpful assistant that answers questions based on the provided context."},
                {"role": "user", "content": prompt}
            ],
            temperature=0,  # Lower temperature for more focused responses
            max_tokens=500
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error generating response: {str(e)}"


In [70]:
def generate_response(query: str, context: str):
    """Generate a response using Groq's Llama-3.1-8b-instant with conversation history"""
    # Construct the prompt
    prompt = get_prompt(context, query)

    try:
        # Create the chat completion request
        completion = client.chat.completions.create(
            model="llama-3.1-8b-instant",  # Using Groq's model
            messages=[
                {"role": "system", "content": "You are a helpful assistant that answers questions based on the provided context."},
                {"role": "user", "content": prompt}
            ],
            temperature=1,  # Adjust temperature for creativity
            max_tokens=1024,  # Maximum tokens for the response
            top_p=1,  # Nucleus sampling
            stream=True  # Enable streaming for response
        )

        # Process and return the streamed response
        response_content = []
        for chunk in completion:
              content = chunk.choices[0].delta.content or ""
              response_content.append(content)

          # Combine the response into a single string
        response = "".join(response_content)
        return response

    except Exception as e:
        return f"Error generating response: {str(e)}"


In [71]:
def rag_query(pinecone_index, query: str, n_chunks: int = 2):
    """Perform RAG query: retrieve relevant chunks and generate an answer using Pinecone."""
    # Retrieve relevant chunks using Pinecone
    results = semantic_search(pinecone_index, query, n_chunks)
    context, sources = get_context_with_sources(results)

    # Generate response using Groq
    response = generate_response(query, context)

    return response, sources

In [72]:
query = "How much the agriculture target will be increased by how many crore?"
response, sources = rag_query(index, query)

# Print results
print("\nQuery:", query)
print("\nAnswer:", response)
print("\nSources used:")
for source in sources:
    print(f"- {source}")



Query: How much the agriculture target will be increased by how many crore?

Answer: The agriculture credit target will be increased to ₹ 20 lakh crore.

Sources used:
- budget_speech.pdf (chunk 73.0)
- budget_speech.pdf (chunk 27.0)
