In [24]:
from pinecone import Pinecone
import openai

import tiktoken  # To properly count tokens
from typing import List

import fitz  # PyMuPDF

import time

import os
import sys
project_root = os.path.abspath(os.path.join(os.getcwd(), "../"))
sys.path.append(project_root)
import keys

In [6]:
# Publish the OpenAI key from the local `keys` module as an environment variable.
# NOTE(review): presumably this is how the argument-less `openai.OpenAI()` client
# created later in the notebook finds its credentials — confirm.
os.environ["OPENAI_API_KEY"] = keys.OPEN_AI_API_KEY

In [12]:
# Create the Pinecone client, authenticating with the key stored in the local `keys` module.
pc = Pinecone(api_key=keys.PINECONE_KEY)

# Get a handle to the "rag-demo" index; every upsert/search/delete cell below goes
# through this module-level `index` object.
# NOTE(review): the index is assumed to already exist — nothing in this notebook creates it.
index = pc.Index("rag-demo")

In [9]:
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of the PDF at ``pdf_path``.

    Pages are read in document order with PyMuPDF (``fitz``). Each page's text
    is followed by a blank-line separator (``"\\n\\n"``), including the last page.
    """
    with fitz.open(pdf_path) as pdf:
        page_texts = [page.get_text("text") + "\n\n" for page in pdf]
    return "".join(page_texts)

In [11]:
# Shared tiktoken encoding used for both encoding and decoding in this cell.
tokenizer = tiktoken.get_encoding("cl100k_base")

def tokenize_text(text: str) -> List[int]:
    """Encode ``text`` with the module-level ``tokenizer``.

    Args:
        text: The string to tokenize.

    Returns:
        The token ids produced by ``tokenizer.encode``. These are integers
        (not strings); ``chunk_text`` slices this list and passes the slices
        back to ``tokenizer.decode``.
    """
    return tokenizer.encode(text)

def chunk_text(text: str, chunk_size=300, overlap=50) -> List[str]:
    """Split a document into overlapping chunks measured in tokens.

    Args:
        text: The document to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The decoded text of each chunk, in document order. An empty document
        yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    # Validate first: overlap == chunk_size would give range() a zero step, and
    # overlap > chunk_size a negative step that silently produces no chunks.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    tokens = tokenize_text(text)
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(tokens), step):
        window = tokens[start:start + chunk_size]
        chunks.append(tokenizer.decode(window))  # Convert tokens back to text
        # This window already reaches the end of the document; any further
        # window would lie entirely inside it, so stop here.
        if start + chunk_size >= len(tokens):
            break

    return chunks

# def load_and_chunk_docs(folder_path: str) -> List[str]:
#     """Loads all text files from a folder, chunks them, and returns a list of chunks."""
#     text_chunks = []
    
#     for filename in os.listdir(folder_path):
#         if filename.endswith(".txt"):  # Ensure we're only processing text files
#             with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as file:
#                 text = file.read()
#                 chunks = chunk_text(text)  # Apply chunking
#                 text_chunks.extend(chunks)  # Store all chunks
                
#     return text_chunks

def load_and_chunk_docs(folder_path: str) -> List[str]:
    """Load every ``.pdf`` and ``.txt`` file in ``folder_path`` and chunk its text.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A flat list with the chunks of all supported files. Files are visited
        in sorted filename order so the chunk order is reproducible between runs.
    """
    text_chunks = []

    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)

        # A sub-directory whose name happens to end in .txt/.pdf cannot be read
        # as a document; skip it instead of failing on open().
        if not os.path.isfile(file_path):
            continue

        if filename.endswith(".pdf"):  # Process PDF
            print(f"📖 Extracting text from PDF: {filename}")
            text = extract_text_from_pdf(file_path)
        elif filename.endswith(".txt"):  # Process TXT
            print(f"📄 Loading text file: {filename}")
            with open(file_path, "r", encoding="utf-8") as file:
                text = file.read()
        else:
            continue  # Skip other file types

        # Apply chunking to the extracted text
        text_chunks.extend(chunk_text(text))

    return text_chunks

# Load and chunk every supported document (.pdf / .txt) in the "docs" folder
folder_path = "docs"
text_chunks = load_and_chunk_docs(folder_path)

# Count only the entries the loader reads (.pdf / .txt files), so the summary
# is not inflated by other files or sub-directories in the folder.
document_count = sum(
    1
    for name in os.listdir(folder_path)
    if name.endswith((".pdf", ".txt"))
    and os.path.isfile(os.path.join(folder_path, name))
)

print(f"📄 Processed {len(text_chunks)} text chunks from {document_count} documents.")

📖 Extracting text from PDF: 32624-pdf.pdf
📄 Processed 1067 text chunks from 1 documents.


In [34]:
def extract_text_from_pdf(pdf_path):
    """Read the PDF at ``pdf_path`` and return all of its page text as one string.

    The text of each page is followed by ``"\\n\\n"``. This re-declares, with the
    same behaviour, the helper of the same name defined earlier in the notebook.
    """
    pieces = []
    with fitz.open(pdf_path) as pdf_doc:
        for pdf_page in pdf_doc:
            # Page text first, then the blank-line separator.
            pieces.append(pdf_page.get_text("text"))
            pieces.append("\n\n")
    return "".join(pieces)

def chunk_text(text, chunk_size=1000, overlap=200):
    """Split text into overlapping chunks measured in whitespace-separated words.

    Note: this definition replaces the token-based ``chunk_text`` declared
    earlier in the notebook (same name, different unit and defaults).

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks, each the words of one window joined by single
        spaces. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    # overlap == chunk_size would give range() a zero step, and
    # overlap > chunk_size a negative step that silently produces no chunks.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    tokens = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
        # This window already reaches the last word; any further window would
        # be fully contained in it, so stop instead of emitting a redundant tail.
        if start + chunk_size >= len(tokens):
            break
    return chunks

pdf_path = "docs/32624-pdf.pdf"

# Extract and chunk text
full_text = extract_text_from_pdf(pdf_path)
text_chunks = chunk_text(full_text)

# One record per chunk; the record id is the chunk's position in the document.
documents = [{"id": str(i), "text": chunk} for i, chunk in enumerate(text_chunks)]

# Upsert in batches of 50 to avoid API rate limits
BATCH_SIZE = 50
# Ceiling division: an exact multiple of BATCH_SIZE must not report an extra batch.
total_batches = (len(documents) + BATCH_SIZE - 1) // BATCH_SIZE

for batch_number, i in enumerate(range(0, len(documents), BATCH_SIZE), start=1):
    batch = documents[i : i + BATCH_SIZE]
    index.upsert_records(records=batch, namespace="default")

    print(f"✅ Uploaded batch {batch_number}/{total_batches}")

    # Pause between batches to stay under Pinecone's rate limit; there is
    # nothing to wait for after the final batch.
    if batch_number < total_batches:
        time.sleep(10)  # Wait 10 seconds between batches


print("✅ Successfully upserted PDF content into Pinecone!")

✅ Uploaded batch 1/5
✅ Uploaded batch 2/5
✅ Uploaded batch 3/5
✅ Uploaded batch 4/5
✅ Uploaded batch 5/5
✅ Successfully upserted PDF content into Pinecone!


In [53]:
def search_pinecone(query_text, top_k=5):
    """Query the "default" namespace of the module-level ``index`` with raw text.

    Args:
        query_text: The natural-language query. It is sent as text rather than
            as a vector (the original author's note says Pinecone embeds it
            automatically via integrated inference).
        top_k: Number of hits to request.

    Returns:
        The response of ``index.search_records`` unchanged; only the ``text``
        field of each record is requested.
    """
    query_spec = {
        "inputs": {"text": query_text},
        "top_k": top_k,
    }

    # Reranking is intentionally left disabled. The configuration that was
    # tried previously:
    #   rerank={"model": "bge-reranker-v2-m3", "top_n": 3, "rank_fields": ["text"]}
    return index.search_records(
        namespace="default",
        query=query_spec,
        fields=["text"],
    )

# Example usage: run one query and show the score and text of every hit.
query_text = (
    "Who were the main figures in THE EXPANSION OF ROME TO THE UNIFICATION "
    "OF THE ITALIAN PENINSULA in 265 B. C."
)
results = search_pinecone(query_text)

for result in results["result"]["hits"]:
    score_line = f"🔹 Score: {result['_score']}"
    text_line = f"📄 Text: {result['fields']['text']}\n"
    print(score_line)
    print(text_line)


🔹 Score: 0.4572366774082184
📄 Text: The Project Gutenberg EBook of A History of Rome to 565 A. D. by Arthur Edward Romilly Boak This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at http://www.gutenberg.org/license Title: A History of Rome to 565 A. D. Author: Arthur Edward Romilly Boak Release Date: May 31, 2010 [Ebook 32624] Language: English ***START OF THE PROJECT GUTENBERG EBOOK A HISTORY OF ROME TO 565 A. D.*** A HISTORY OF ROME TO 565 A. D. BY ARTHUR E. R. BOAK, Ph. D., Professor of Ancient History in the University of Michigan v New York THE MACMILLAN COMPANY 1921 All rights reserved COPYRIGHT, 1921. By THE MACMILLAN COMPANY. Set up and electrotyped. Published December, 1921. vii PRINTED IN THE UNITED STATES OF AMERICA [v] PREFACE This sketch of the History of Rome to 565 A. D. is primarily intended to me

In [33]:
# DESTRUCTIVE: delete_all=True removes every record in the "default" namespace,
# the same namespace the upsert and search cells in this notebook use.
# NOTE(review): this cell's execution count (33) precedes the upsert cell's (34), so it
# appears to have been run as a reset *before* re-uploading. In file order it sits after
# the upsert and before the question-answering cells, so a top-to-bottom run would
# empty the namespace before it is queried — confirm the intended position.
index.delete(namespace="default", delete_all=True)
print("✅ All documents deleted from Pinecone!")

✅ All documents deleted from Pinecone!


In [57]:
def answer_question_with_chatgpt(question):
    """Answer ``question`` using chunks retrieved from Pinecone as context.

    The question is passed to ``search_pinecone``; the ``text`` field of every
    hit is joined with blank lines and embedded in a prompt that is sent to the
    ``gpt-4`` chat model together with a fixed system message.

    Args:
        question: The user's question, used both as the retrieval query and in
            the prompt.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the reply carries no text content.
    """
    # Retrieve context from Pinecone
    search_results = search_pinecone(question)

    # Extract relevant text chunks
    hits = search_results["result"]["hits"]
    context = "\n\n".join(hit["fields"]["text"] for hit in hits)

    # Construct the prompt for ChatGPT
    prompt = f"""
    Based on the following information, which was extracted from a History book pdf but retrieved using embeddings, answer the question:

    {context}

    Question: {question}
    Answer:
    """

    client = openai.OpenAI()
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "system", "content": "You are an intelligent assistant who will get history info and try to explain it in a comprehensible way."},
                  {"role": "user", "content": prompt}]
    )

    # `message.content` is nullable in the OpenAI SDK; calling .strip() on None
    # would raise AttributeError, so fall back to an empty string.
    content = response.choices[0].message.content
    return (content or "").strip()


In [58]:
# Ask one question end-to-end (retrieval + generation) and show the reply.
question = (
    "Who were the main figures in THE EXPANSION OF ROME TO THE UNIFICATION "
    "OF THE ITALIAN PENINSULA in 265 B. C.?"
)
answer = answer_question_with_chatgpt(question)
print("🤖 Answer:", answer)

🤖 Answer: The main figures in the expansion of Rome to the unification of the Italian Peninsula in 265 B.C. are not explicitly mentioned in the extracted text. However, based on the timeline mentioned, Roman consuls would have played a major role in its expansion during this period. The text does reference particular regions (Such as the Etruscans, Latins, and the Greeks) and groups (like the Latin League, Samnites, Gauls, and Etruscans), indicating the interactions Rome had with them during its expansion, but specific individuals are not named.
