In [49]:
PINECONE_API_KEY="pcsk_69GeMY_KqUsphtXM5m7bR9SPQW6tf7b2wLmbHG15WoW7THKuyP3n4b611J8Q1VyomEJdcm"
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "document-search"
if index_name not in [idx["name"] for idx in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=1024,  # Adjust based on embedding model
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

In [50]:

index = pc.Index(index_name)


In [51]:
index

<pinecone.data.index.Index at 0x279c61d1f30>

In [40]:
import os
import re
from unstructured.partition.auto import partition
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

def process_documents(directory_path):
    """
    Process all documents (DOCX, PDF, TXT) in a directory and combine their text content.
    
    Args:
        directory_path (str): Path to the directory containing documents
    
    Returns:
        list: List of tuples (filename, extracted text)
    """
    supported_extensions = ('.docx', '.pdf', '.txt')
    combined_text = []
    
    for filename in os.listdir(directory_path):
        if filename.lower().endswith(supported_extensions):
            file_path = os.path.join(directory_path, filename)
            try:
                # Auto-detect file type and partition
                elements = partition(file_path)
                # Extract clean text
                file_text = [element.text for element in elements if hasattr(element, 'text') and element.text.strip()]
                
                for text in file_text:
                    cleaned_text = preprocess_text(text)  # Apply preprocessing
                    combined_text.append((filename, cleaned_text))  # Store filename with cleaned text
                
                print(f"✅ Processed {filename} ({len(file_text)} elements)")
            except Exception as e:
                print(f"❌ Failed to process {filename}: {str(e)}")
     
    return combined_text

def remove_copyright(text):
    """ Remove common copyright notices & disclaimers """
    copyright_patterns = [
        r"©\s?\d{4}\s?.*?",  # Example: "© 2024 XYZ Corp."
        r"all rights reserved",
        r"this document is copyrighted by .*?",
        r"no part of this publication may be reproduced.*?",
        r"for informational purposes only.*?",
        r"unauthorized duplication is prohibited.*?"
    ]
    for pattern in copyright_patterns:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)
    return text.strip()

def preprocess_text(text):
    """ Lowercase, remove extra spaces, remove special characters, and clean text """
    text = text.lower()  
    text = remove_copyright(text)
    text = re.sub(r'\s+', ' ', text)  
    return text.strip()

# Define semantic chunking strategy
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,  
    chunk_overlap=50,
    separators=["\n\n", ".", "!", "?", ";"]
)

# Load embedding model
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en")


In [34]:
processed_data=process_documents("all_documents/")

✅ Processed Breast Cancer Research Articles - NCI.docx (168 elements)
✅ Processed cancers-15-00321.docx (364 elements)
✅ Processed ijo-57-06-1245.docx (292 elements)


In [47]:
def store_in_pinecone(text_chunks, batch_size=10):
    """
    Store processed text chunks into Pinecone in smaller batches while ensuring payloads are within limits.
    """
    embeddings = embedding_model.embed_documents(text_chunks)
    
    vectors = []
    for i, (text, embedding) in enumerate(zip(text_chunks, embeddings)):
        vectors.append({
            "id": f"vec{i}",
            "values": embedding,
            "metadata": {"text": text[:200]}  # Trim long text in metadata
        })

    # Upsert in smaller batches to stay under the 4MB limit
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i:i + batch_size]
        try:
            index.upsert(vectors=batch, namespace="ns1")
            print(f"✅ Upserted batch {i // batch_size + 1}")
        except Exception as e:
            print(f"❌ Failed to upsert batch {i // batch_size + 1}: {e}")

    print("✅ Successfully stored in Pinecone.")


In [52]:
all_texts = [text for _, text in processed_data]
text_chunks = []
for text in all_texts:
        text_chunks.extend(text_splitter.split_text(text))
    
store_in_pinecone(text_chunks)

✅ Upserted batch 1
✅ Upserted batch 2
✅ Upserted batch 3
✅ Upserted batch 4
✅ Upserted batch 5
✅ Upserted batch 6
✅ Upserted batch 7
✅ Upserted batch 8
✅ Upserted batch 9
✅ Upserted batch 10
✅ Upserted batch 11
✅ Upserted batch 12
✅ Upserted batch 13
✅ Upserted batch 14
✅ Upserted batch 15
✅ Upserted batch 16
✅ Upserted batch 17
✅ Upserted batch 18
✅ Upserted batch 19
✅ Upserted batch 20
✅ Upserted batch 21
✅ Upserted batch 22
✅ Upserted batch 23
✅ Upserted batch 24
✅ Upserted batch 25
✅ Upserted batch 26
✅ Upserted batch 27
✅ Upserted batch 28
✅ Upserted batch 29
✅ Upserted batch 30
✅ Upserted batch 31
✅ Upserted batch 32
✅ Upserted batch 33
✅ Upserted batch 34
✅ Upserted batch 35
✅ Upserted batch 36
✅ Upserted batch 37
✅ Upserted batch 38
✅ Upserted batch 39
✅ Upserted batch 40
✅ Upserted batch 41
✅ Upserted batch 42
✅ Upserted batch 43
✅ Upserted batch 44
✅ Upserted batch 45
✅ Upserted batch 46
✅ Upserted batch 47
✅ Upserted batch 48
✅ Upserted batch 49
✅ Upserted batch 50
✅ Upserte

In [58]:
print(index.describe_index_stats())

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'ns1': {'vector_count': 1078}},
 'total_vector_count': 1078,
 'vector_type': 'dense'}


In [61]:
query = "what defines triple negative breast cancer (tnbc) at the immunohistochemical level?"

embedding = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={
        "input_type": "query"
    }
)


In [62]:
results = index.query(
    namespace="ns1",
    vector=embedding[0].values,
    top_k=3,
    include_values=False,
    include_metadata=True
)



{'matches': [{'id': 'vec40',
              'metadata': {'text': 'nci researchers have shown that an '
                                   'experimental form of immunotherapy that '
                                   'uses an individual’s own tumor- ghting '
                                   'immune cells could potentially be used to '
                                   'treat people with metastatic breast cancer '
                                   'w'},
              'score': 0.0485979021,
              'values': []},
             {'id': 'vec753',
              'metadata': {'text': '. the incite phase ii trial (nct03971409) '
                                   'also includes a two ‑week induction of '
                                   'binimetinib (mek inhibitor), utomilumab '
                                   '(4‑1bb agonist), or pf‑ 04518600 '
                                   '(anti‑ox40 antibody) which may help '
                                   'activate t'},
              '

In [66]:
from IPython.display import Markdown, display
import google.generativeai as genai
import pinecone
import json

def ask_pinecone(index, query, k):
    """
    Retrieve relevant documents from Pinecone, generate a response with citations,
    and return data in two parts:
    1. Display answer and sources in Markdown format
    2. Return JSON containing query, retrieved Pinecone content, and LLM-generated answer
    
    Args:
        index (pinecone.Index): Pinecone vector index
        query (str): User's question
        k (int): Number of top results to retrieve
    
    Returns:
        dict: JSON containing query, retrieved content from Pinecone, and LLM-generated answer
    """
    # Convert query into an embedding
     # Replace with actual embedding model
    embedding = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={
        "input_type": "query"
    }
    )
    # Search in Pinecone
    results = index.query(
        namespace="ns1",  # Change if using a different namespace
        vector=embedding,
        top_k=k,
        include_values=False,
        include_metadata=True  # Get stored metadata
    )

    # Prepare context with citations and metadata
    context_parts = []
    source_mapping = {}
    retrieved_content = []

    for i, match in enumerate(results['matches'], 1):
        metadata = match['metadata']
        source = metadata.get('source_filename', f"Document {i}")  # Get filename from metadata
        chunk_id = metadata.get('chunk_id', 'Unknown')  # Retrieve chunk ID
        score = match['score']  # Similarity score

        # Format context with citation
        context_parts.append(f"[[{i}]] {metadata['text']}")
        source_mapping[i] = {
            "source": source,
            "chunk_id": chunk_id,
            "score": float(score)  # Ensure JSON serialization
        }
        retrieved_content.append({
            "source_filename": source,
            "chunk_id": chunk_id,
            "page_content": metadata['text'],
            "similarity_score": float(score)
        })

    context_text = "\n\n".join(context_parts)

    # Construct prompt with citations
    prompt = f"""
    Context information is below. Each section is marked with [[NUMBER]] citations.
    ---------------------
    {context_text}
    ---------------------
    Given the context, answer this question: {query}
    
    Requirements:
    1. If the information isn't in the context, say "I don't have that information"
    2. For any facts used, include [[NUMBER]] citations pointing to which document they came from
    3. Include a "Sources" section at the end listing all cited documents with chunk IDs
    4. Keep the answer concise but accurate
    """

    # Generate response using Gemini
    genai.configure(api_key="AIzaSyAYy9e5qAvCyytku6ardpMywXhRNjkfLRs")  # Replace with actual API key
    model = genai.GenerativeModel("gemini-1.5-flash")
    response = model.generate_content(prompt)

    # Extract generated text
    generated_text = response.candidates[0].content.parts[0].text

    # Build sources section
    sources_section = "\n\n## Sources\n"
    cited_sources = []
    for num, metadata in source_mapping.items():
        if f"[[{num}]]" in generated_text:
            source_entry = {
                "citation": f"[[{num}]]",
                "source_filename": metadata["source"],
                "chunk_id": metadata["chunk_id"],
                "similarity_score": metadata["score"]
            }
            cited_sources.append(source_entry)
            sources_section += f"- [[{num}]] {metadata['source']} (Chunk ID: {metadata['chunk_id']}, Similarity Score: {metadata['score']:.2f})\n"

    full_response = f"Answer: {generated_text}\n{sources_section}"

    # Part 1: Display response in Markdown
    display(Markdown(full_response))

    # Part 2: Return JSON output
    output_json = {
        "query": query,
        "retrieved_content": retrieved_content,  # Raw Pinecone results
        "generated_response": generated_text
    }

    return json.dumps(output_json, indent=4)


In [67]:
ask_pinecone(index,query,k=5)

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Thu, 27 Mar 2025 12:40:35 GMT', 'Content-Type': 'text/plain', 'Content-Length': '68', 'Connection': 'keep-alive', 'server': 'envoy'})
HTTP response body: : invalid value Starting an object on a scalar field for type vector
