In [1]:
# Install Pinecone client
!pip install pinecone-client

# Install SentenceTransformer for embedding generation
!pip install sentence-transformers

# Install Google Generative AI (Gemini-pro) client
!pip install google-generativeai

# Install pdfplumber for extracting text from PDFs
!pip install pdfplumber



Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successfully installed pinecone-client-5.0

In [2]:
import os
from getpass import getpass

import google.generativeai as genai
import pdfplumber
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

In [3]:
# Read credentials from the environment, falling back to an interactive prompt,
# so that real API keys are never saved inside the notebook.
google_api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")

# Configure the Google Generative AI (Gemini) client
genai.configure(api_key=google_api_key)

# Initialize Pinecone
pc = Pinecone(api_key=pinecone_api_key)

In [4]:
# Sentence-embedding model shared by the storage and retrieval functions
embedder = SentenceTransformer('all-mpnet-base-v2')

# Handle to the Pinecone index with the name given below
index_name = "colab"  # Replace with your actual pinecone index name
index = pc.Index(index_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts (e.g. a file path).

    Returns:
        str: The text of all pages that yielded any text, joined with
        newlines; an empty string when no page produced text.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() may yield nothing for a page; normalise that to "".
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Separate pages with a newline so the last word of one page is not fused
    # with the first word of the next one.
    return "\n".join(text for text in page_texts if text)

In [6]:
# Function to store document in Pinecone
def store_document_in_pinecone(doc_text, batch_size=100):
    """Split a document into sentences, embed them and upsert them into Pinecone.

    The text is split on ``'. '``; blank fragments are dropped. Each remaining
    sentence is embedded with the module-level ``embedder`` and written to the
    module-level ``index`` under the ID ``sentence-<position>`` with the
    sentence stored as ``text`` metadata.

    NOTE(review): IDs depend only on the sentence position, so calling this for
    a second document reuses the same IDs as the first — confirm this is intended.

    Args:
        doc_text: Full document text.
        batch_size: Maximum number of vectors sent per ``index.upsert`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Skip empty/whitespace-only fragments so they are not embedded and stored.
    sentences = [s for s in doc_text.split('. ') if s.strip()]
    if not sentences:
        return

    embeddings = embedder.encode(sentences)
    vectors = [
        (f"sentence-{i}", emb.tolist(), {'text': sentence})
        for i, (sentence, emb) in enumerate(zip(sentences, embeddings))
    ]
    # Send the vectors in batches instead of one request per sentence.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors[start:start + batch_size])

In [7]:
# Function to retrieve relevant chunks from Pinecone
def retrieve_relevant_chunks(query, top_k=5):
    """Return the stored sentences most similar to ``query`` as one string.

    Args:
        query: Question or search text (a single string).
        top_k: Number of matches to request from the index.

    Returns:
        str: The ``text`` metadata of the returned matches, in the order the
        index returned them, joined with single spaces.
    """
    # Encode the bare string rather than a one-element list: that yields one
    # flat vector, whereas encoding a list yields a nested [[...]] batch, and
    # the query's `vector` argument takes a single list of floats.
    query_vector = embedder.encode(query).tolist()
    results = index.query(vector=query_vector, top_k=top_k, include_metadata=True)
    return " ".join(match['metadata']['text'] for match in results['matches'])

In [25]:
# Function to get an answer from a Gemini model
def question_text(retrieved_text, question, model_name="gemini-1.5-flash"):
    """Ask a Gemini model to answer ``question`` given ``retrieved_text``.

    Args:
        retrieved_text: Context passages to place in the prompt.
        question: The user's question.
        model_name: Name passed to ``genai.GenerativeModel``.

    Returns:
        str: The model's response text with every ``*`` character removed.

    NOTE(review): ``response.text`` is read without a guard; presumably it can
    raise when the response carries no text — confirm against the client docs.
    """
    model = genai.GenerativeModel(model_name)
    prompt = f"Answer the following question:\n\nText: {retrieved_text}\n\nQuestion: {question}"
    response = model.generate_content([prompt])

    # Strip Markdown emphasis markers. Removing every '*' already removes
    # every '**', so a single replace is sufficient.
    return response.text.replace('*', '')

In [26]:
# Load a PDF and store it in Pinecone
# The extracted text is split, embedded and upserted by store_document_in_pinecone.
pdf_path = "/content/BlackHoles.pdf"    # Replace with your PDF file path
doc_text = extract_text_from_pdf(pdf_path)
store_document_in_pinecone(doc_text)


In [28]:
# Ask a question
# NOTE(review): "Schwarchild" looks like a misspelling of "Schwarzschild" (the
# spelling used in the retrieved text); left unchanged because the recorded
# output below was produced with this exact query.
question = "Who is Schwarchild?"
# Fetch the most similar stored sentences, then let the model answer from them.
retrieved_text = retrieve_relevant_chunks(question)
answer = question_text(retrieved_text, question)
# Show both the retrieved context and the generated answer for inspection.
print("Retrieved Text:\n", retrieved_text)
print("\n\nGenerated Answer:\n", answer)


Retrieved Text:
 Hamilton]Karl Schwarzschild’s Work
In 1916 Schwarzschild read Einstein’s
paper on general relativity He was
interested in the physics of stars,
and had a lot of spare time between
battles on the Russian front, so he solved
Einstein’s field equation for the region
outside a massive spherical object.
His solution had many interesting features,
including
q prediction of space warping in strong
gravity, and invention of embedding
diagrams to visualize it.
q verification gravitational time dilation,
just as Einstein had pictured it.
q prediction of black holes, though this
[slide courtesy of D Watson]
was not recognized at the time.Schwarzschild’s solution
Describes the spacetime curvature near a massive, spherically
symmetric body Begelman & M Watson - Image from Thorne’s “Black Holes and time Warps”]


Generated Answer:
 The text states that Karl Schwarzschild is the person being discussed. 

