In [1]:
from unstructured.partition.auto import partition
import os

def process_documents(directory_path):
    """
    Process all documents (DOCX, PDF, TXT) in a directory and combine their text content.
    
    Args:
        directory_path (str): Path to the directory containing documents
    
    Returns:
        list: Combined text data from all documents
    """
    supported_extensions = ('.docx', '.pdf', '.txt')
    combined_text = []
    
    for filename in os.listdir(directory_path):
        if filename.lower().endswith(supported_extensions):
            file_path = os.path.join(directory_path, filename)
            try:
                # Auto-detect file type and partition
                elements = partition(file_path)
                # Extract clean text
                file_text = [element.text for element in elements if hasattr(element, 'text') and element.text.strip()]
                combined_text.extend(file_text)
                print(f"✅ Processed {filename} ({len(file_text)} elements)")
            except Exception as e:
                print(f"❌ Failed to process {filename}: {str(e)}")
    
    return combined_text

# Example usage:
documents_directory = "all_documents"
all_text_data = process_documents(documents_directory)

print(f"\nTotal text elements extracted: {len(all_text_data)}")
print("Sample elements:")
print(all_text_data[:3])  # Show first 3 elements

✅ Processed Breast Cancer Research Articles - NCI.docx (168 elements)
✅ Processed cancers-15-00321.docx (364 elements)
✅ Processed ijo-57-06-1245.docx (292 elements)

Total text elements extracted: 824
Sample elements:
['3/24/25, 9:53 AM\tBreast Cancer Research Articles - NCI', 'Breast Cancer Research Results and Study Updates', 'See Advances in Breast Cancer Research for an overview of recent']


In [4]:
len(all_text_data)

824

In [9]:
# Import functions / modules
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
from IPython.display import display, Markdown

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document

def prepare_from_combined_text(combined_text):
    """
    Process already-loaded text data into a FAISS index using PubMedBERT embeddings
    
    Args:
        combined_text (list): List of text strings from previous processing
    
    Returns:
        FAISS: Vector index of all documents
    """
    # Convert raw text to LangChain Documents
    documents = [Document(page_content=text) for text in combined_text if text.strip()]
    
    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,  # Slightly smaller chunks recommended for PubMedBERT
        chunk_overlap=100
    )
    chunks = text_splitter.split_documents(documents)
    
    # Initialize PubMedBERT embeddings
    embeddings = HuggingFaceEmbeddings(
        model_name="neuml/pubmedbert-base-embeddings",  # or 'cuda' if available
        encode_kwargs={'normalize_embeddings': True}
    )
    
    # Create FAISS index
    print(f"Creating FAISS index with {len(chunks)} chunks using PubMedBERT...")
    return FAISS.from_documents(chunks, embeddings)

# Example usage:
# faiss_index = prepare_from_combined_text(combined_text)

In [13]:
df=prepare_from_combined_text(combined_text=all_text_data)

Creating FAISS index with 832 chunks using PubMedBERT...


In [21]:
from IPython.display import Markdown, display
import google.generativeai as genai

def ask(df, query, k):
    # Retrieve relevant documents with scores
    docs_faiss = df.similarity_search_with_score(query, k=k)
    
    # Prepare context with citations
    context_parts = []
    source_mapping = {}
    
    for i, (doc, score) in enumerate(docs_faiss, 1):
        source = doc.metadata.get('source', f"Document {i}")  # Get filename from metadata
        context_parts.append(f"[[{i}]] {doc.page_content}")
        source_mapping[i] = (source, score)  # Store source and similarity score
    
    context_text = "\n\n".join(context_parts)
    
    # Enhanced prompt with citation instructions
    prompt = f"""
    Context information is below. Each section is marked with [[NUMBER]] citations.
    ---------------------
    {context_text}
    ---------------------
    Given the context, answer this question: {query}
    
    Requirements:
    1. If the information isn't in the context, say "I don't have that information"
    2. For any facts used, include [[NUMBER]] citations pointing to which document they came from
    3. Include a "Sources" section at the end listing all cited documents
    4. Keep the answer concise but accurate
    """
    
    # Generate response
    genai.configure(api_key="AIzaSyCXrPcd8h6o0LAcBTtqGDBqCfcLbmNg2-o")
    model = genai.GenerativeModel("gemini-1.5-flash")
    response = model.generate_content(prompt)
    
    # Format the output with sources
    generated_text = response.candidates[0].content.parts[0].text
    
    # Add detailed sources section
    sources_section = "\n\n## Sources\n"
    for num, (source, score) in source_mapping.items():
        if f"[[{num}]]" in generated_text:
            sources_section += f"- [[{num}]] {source} (similarity score: {score:.2f})\n"
    
    full_response = f"{generated_text}\n{sources_section}"
    
    return display(Markdown(full_response))

In [22]:
# Define the query
import google.generativeai as genai
query = """
What does TNBC stand for?

"""

In [23]:
ask(df,query,5)

TNBC stands for triple-negative breast cancer. [[4], [5]]

Sources:
[[2]] Kwa, M.J.; Adams, S. Checkpoint inhibitors in triple-negative breast cancer (TNBC): Where to go from here. Cancer 2018, 124, 2086–2103. [CrossRef]
[[3]] Findings from the TAILORx clinical trial show chemotherapy does not benefit most women with early breast cancer. The new data, released at the 2018 ASCO annual meeting, will help inform treatment decisions for many women with early-stage breast cancer.
[[4]] TNBC is the subtype of breast cancer with the worst prognosis. To date, although several targeted drugs have been approved for the treatment of TNBC, the urgent need for improved survival has not been met. The practice of immunotherapy in TNBC is just beginning to take off. An advantage of the later start in this field is that experience can be learned from other tumor types, both successful and failed. Although some progress has been made with respect to ICIs for TNBC, many challenges remain. Clinical results show that only a small proportion of patients with TNBC actually benefit from immunotherapy. Thus, identifying the target population and expanding the efficacy is a top priority. Overall, combination treatment is the way forward, but the combination treatment mode, sequence,
[[5]] Developing novel treatments in both early and advanced TNBC settings remains a significant unmet need. Recent advances with novel agents have been made for specific subgroups with PD‑L1+ tumors or gBRCAm tumors. However, only a fraction of those patients respond to immune check-point or PARP inhibitors, and even those who do respond often develop resistance and relapse. In diverse tumor microenvironments, a given therapeutic agent shows vari-able responses, thus compromising the survival endpoints especially in an unselected TNBC population. Therefore, developing novel predictive biomarkers are crucial for selecting patients that will benefit the most from a given therapy. Single cell technologies will provide additional insight on tumor‑stroma interactions and facilitate compelling rationale for new treatments based on novel biomarkers. A non‑invasive testing of plasma circulating tumor DNA (ctDNA) and CTCs can potentially provide real ‑time disease monitoring and even early therapy modification. However, their prognostic value needs further evaluation. With recent advances in multiomic analyses of cancers, there appears to be genomic and molecular similarities between TNBC and high ‑grade serous ovarian carcinoma (HGSOC), suggesting that similar biological mechanisms drive some aspects of both cancer types. Therefore, treatment strategies for HGSOC can be explored in TNBC as well. The recent increase in the number of clinical trials investigating various new agents and combination strategies reflects further efforts to understand molecular and immunological aspects of TNBC. This may lead to more meaningful clinical benefits, including event‑free and overall survival.
[[1]] The test, which helps guide treatment decisions, was not as good at predicting the risk of death from breast cancer for Black patients as for White patients, a new study has found. The ndings highlight the need for greater racial diversity in research studies.




## Sources
- [[1]] Document 1 (similarity score: 0.95)
- [[2]] Document 2 (similarity score: 0.97)
- [[3]] Document 3 (similarity score: 0.98)
- [[4]] Document 4 (similarity score: 1.05)
- [[5]] Document 5 (similarity score: 1.06)


In [27]:
from IPython.display import Markdown, display
import google.generativeai as genai

def ask(df, query, k=3, score_threshold=0.7):
    """
    Enhanced RAG function with:
    - PDF page number citations
    - Document titles in metadata
    - Score threshold filtering
    - Beautiful source attribution
    
    Args:
        df: FAISS vector store
        query: User question
        k: Number of docs to retrieve
        score_threshold: Minimum similarity score (0-1)
    """
    # Retrieve documents with scores
    docs_faiss = df.similarity_search_with_score(query, k=k)
    
    # Filter by score threshold and prepare context
    context_parts = []
    source_mapping = {}
    valid_docs = []
    
    for i, (doc, score) in enumerate(docs_faiss, 1):
        if score < score_threshold:
            continue
            
        valid_docs.append((doc, score))
        
        # Extract metadata with fallbacks
        metadata = doc.metadata
        title = metadata.get('title', "Untitled Document")
        source = metadata.get('source', "Unknown Source")
        page = metadata.get('page', "N/A")
        
        # Create citation label
        citation_id = f"[Doc{i}]"
        context_parts.append(f"{citation_id}\n{doc.page_content}")
        
        # Store source info
        source_mapping[citation_id] = {
            'title': title,
            'source': source,
            'page': page,
            'score': f"{score:.2f}"
        }
    
    if not valid_docs:
        return display(Markdown("🔍 No relevant documents found above similarity threshold."))
    
    context_text = "\n\n---\n\n".join(context_parts)
    
    # Enhanced prompt
    prompt = f"""You're a research assistant. Answer using ONLY these verified sources:

{context_text}

Question: {query}

Guidelines:
1. ALWAYS cite sources like this: [Doc1]
2. For page references: [Doc1, p.3]
3. If unsure, say "I couldn't find definitive evidence"
4. Include ALL relevant citations
5. Structure your answer clearly

Final Answer:"""
    
    # Generate response
    genai.configure(api_key="AIzaSyCXrPcd8h6o0LAcBTtqGDBqCfcLbmNg2-o")
    model = genai.GenerativeModel("gemini-1.5-flash")
    response = model.generate_content(prompt)
    generated_text = response.text
    
    # Build beautiful sources section
    sources_section = "\n\n### References\n"
    for citation_id, info in source_mapping.items():
        if citation_id in generated_text:
            sources_section += (
                f"- {citation_id}: {info['title']}\n"
                f"  - Source: {info['source']}\n"
                f"  - Page: {info['page']}\n"
                f"  - Similarity: {info['score']}\n\n"
            )
    
    # Combine everything
    full_response = f"{generated_text}\n{sources_section}"
    return display(Markdown(full_response))

In [28]:
ask(df,query,k=3)

I couldn't find definitive evidence for what TNBC stands for in the provided source documents.  While Doc2 mentions "triple-negative breast cancer (TNBC)", it does not provide the full meaning of the acronym.



### References
