In [2]:
import os
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from langchain_pinecone import PineconeVectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from pinecone import Pinecone, ServerlessSpec
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Step 1: Set up environment variables
# Read the API key from the PINECONE_API_KEY environment variable so the secret
# does not have to be written into the source. The original placeholder string
# is kept as the fallback so the script behaves as before when the variable is
# not set.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', 'your-pinecone-api-key')

# Step 2: Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Step 3: Create or connect to a Pinecone index
# The index is only created when no index with this name is listed yet, so
# re-running the script reuses the existing one.
index_name = "pdf-store-open-source"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Sentence Transformer 'all-MiniLM-L6-v2' embedding dimension
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-west-2")
    )

# Step 4: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    The file is opened in binary mode and read with ``PdfReader``; the text of
    consecutive pages is joined with no separator.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string with the extracted text of all pages. Empty if the
        document has no pages or no page yields any text.
    """
    with open(pdf_path, 'rb') as file:
        pdf = PdfReader(file)
        # ``or ""`` guards against a page whose extraction yields None, which
        # would make string concatenation raise TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # One join instead of repeated ``+=`` avoids re-copying the growing string.
    return "".join(page_texts)

# Read the whole PDF at pdf_path into one string.
pdf_path = "path/to/your/document.pdf"
raw_text = extract_text_from_pdf(pdf_path)

# Step 5: Split text into chunks
# chunk_size and chunk_overlap are both measured with len(), i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)
texts = text_splitter.split_text(raw_text)

# Step 6: Create Document objects
# Each chunk becomes one Document carrying only its text.
documents = [Document(page_content=chunk) for chunk in texts]

# Step 7: Initialize Sentence Transformer embeddings
embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 8: Create embeddings and store in Pinecone
index = pc.Index(index_name)

# Encode every chunk in a single call rather than one call per chunk; the
# guard avoids calling the model with an empty list when the PDF had no text.
chunk_texts = [doc.page_content for doc in documents]
chunk_embeddings = embeddings_model.encode(chunk_texts).tolist() if chunk_texts else []

# Same record layout as before: id is the chunk's position as a string, and the
# chunk text is stored as metadata so it can be returned with query matches.
vectors = [
    (str(position), embedding, {"text": text})
    for position, (embedding, text) in enumerate(zip(chunk_embeddings, chunk_texts))
]

# Send the records in batches so the number of network requests does not grow
# one-for-one with the number of chunks.
UPSERT_BATCH_SIZE = 100
for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
    index.upsert(vectors=vectors[start:start + UPSERT_BATCH_SIZE])

print(f"Successfully stored {len(documents)} document chunks in Pinecone index '{index_name}'")

# Step 9: Load FLAN-T5 model locally
# The seq2seq model and its tokenizer are both loaded from the same checkpoint name.
model_name = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 10: Perform a similarity search and generate response
def query_and_respond(query, k=3):
    """Answer *query* from the chunks most similar to it in the Pinecone index.

    The query is embedded with ``embeddings_model``, the ``k`` closest vectors
    are requested from ``index``, and the ``text`` metadata of the matches is
    joined into a context string. That context and the question are placed in
    a prompt for ``model``, and the first generated sequence is decoded.

    Args:
        query: The question to answer.
        k: Number of matches to request from the index.

    Returns:
        The decoded text of the generated answer.
    """
    max_input_tokens = 512  # input cap applied when tokenizing the prompt
    # Re-tokenizing the assembled prompt may split slightly differently at the
    # seams than tokenizing the pieces separately, so leave a little headroom.
    token_slack = 8

    query_embedding = embeddings_model.encode(query).tolist()
    results = index.query(vector=query_embedding, top_k=k, include_metadata=True)

    context = " ".join(match['metadata']['text'] for match in results['matches'])

    prompt_head = "Answer the following question based on this context: "
    prompt_tail = f"\n\nQuestion: {query}\n\nAnswer:"

    # The context comes before the question, so right-truncating an over-long
    # prompt would cut off the question itself. Trim the context to whatever
    # token budget is left after the fixed parts of the prompt instead.
    fixed_token_count = len(tokenizer(prompt_head + prompt_tail)["input_ids"])
    context_budget = max_input_tokens - fixed_token_count - token_slack
    if context_budget > 0:
        # Ask for one token more than the budget: getting it back means the
        # context overflowed. A context that fits is left exactly as retrieved.
        probe_ids = tokenizer(
            context,
            add_special_tokens=False,
            truncation=True,
            max_length=context_budget + 1,
        )["input_ids"]
        if len(probe_ids) > context_budget:
            context = tokenizer.decode(probe_ids[:context_budget], skip_special_tokens=True)
    else:
        # The question alone uses up the input window; no room for context.
        context = ""

    prompt = f"{prompt_head}{context}{prompt_tail}"

    # Truncation stays on as a safety net for the no-budget case above.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=max_input_tokens, truncation=True)
    # No temperature argument: it only has an effect together with
    # do_sample=True, which was never set, so decoding was already greedy.
    outputs = model.generate(**inputs, max_length=150, num_return_sequences=1)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

# Example usage
# Ask one question and print it followed by the generated answer, one per line.
query = "What is the main topic of this document?"
answer = query_and_respond(query)
print(f"Query: {query}", f"Answer: {answer}", sep="\n")

Number of documents: 31
Number of document chunks: 265
base_url='http://localhost:11434' model='llama2' embed_instruction='passage: ' query_instruction='query: ' mirostat=None mirostat_eta=None mirostat_tau=None num_ctx=None num_gpu=None num_thread=None repeat_last_n=None repeat_penalty=None temperature=None stop=None tfs_z=None top_k=None top_p=None show_progress=False headers=None model_kwargs=None


ValueError: Please set PINECONE_API_KEY and PINECONE_API_ENV environment variables.