# processing the documents 

In [1]:
from unstructured.partition.auto import partition
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document

def process_documents(directory_path):
    """
    Process all documents (DOCX, PDF, TXT) in a directory and combine their text content.
    
    Args:
        directory_path (str): Path to the directory containing documents
    
    Returns:
        list: List of tuples (filename, extracted text)
    """
    supported_extensions = ('.docx', '.pdf', '.txt')
    combined_text = []
    
    for filename in os.listdir(directory_path):
        if filename.lower().endswith(supported_extensions):
            file_path = os.path.join(directory_path, filename)
            try:
                # Auto-detect file type and partition
                elements = partition(file_path)
                # Extract clean text
                file_text = [element.text for element in elements if hasattr(element, 'text') and element.text.strip()]
                
                for text in file_text:
                    combined_text.append((filename, text))  # Store filename with text
                
                print(f"✅ Processed {filename} ({len(file_text)} elements)")
            except Exception as e:
                print(f"❌ Failed to process {filename}: {str(e)}")
    
    return combined_text



# Preprocessing the data and adding to faiss vector database


In [3]:

def prepare_from_combined_text(combined_text):
    """
    Process text data into a FAISS index using PubMedBERT embeddings with metadata.
    
    Args:
        combined_text (list): List of tuples (filename, text) from previous processing
    
    Returns:
        FAISS: Vector index of all documents
    """
    # Convert raw text to LangChain Documents with metadata
    documents = [Document(page_content=text, metadata={"source_filename": filename}) for filename, text in combined_text if text.strip()]
    
    # Split the text into chunks with metadata
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # Slightly smaller chunks recommended for PubMedBERT
        chunk_overlap=100
    )
    chunks = []
    for doc in documents:
        split_chunks = text_splitter.split_documents([doc])
        for i, chunk in enumerate(split_chunks):
            chunk.metadata["chunk_id"] = i  # Add chunk ID for tracking
        chunks.extend(split_chunks)
    
    # Initialize PubMedBERT embeddings
    embeddings = HuggingFaceEmbeddings(
        model_name="neuml/pubmedbert-base-embeddings",
        encode_kwargs={'normalize_embeddings': True}
    )
    
    # Create FAISS index
    print(f"Creating FAISS index with {len(chunks)} chunks using PubMedBERT...")
    return FAISS.from_documents(chunks, embeddings)

In [66]:
def check(word="TNBC",list= combined_text):
    if word in list:
        print("The word is in the list!")
    else:
        print("The word is not in the list!")

In [63]:
combined_text

[('Breast Cancer Research Articles - NCI.docx',
  '3/24/25, 9:53 AM\tBreast Cancer Research Articles - NCI'),
 ('Breast Cancer Research Articles - NCI.docx',
  'Breast Cancer Research Results and Study Updates'),
 ('Breast Cancer Research Articles - NCI.docx',
  'See Advances in Breast Cancer Research for an overview of recent'),
 ('Breast Cancer Research Articles - NCI.docx',
  'ndings and progress, plus ongoing projects supported by NCI.'),
 ('Breast Cancer Research Articles - NCI.docx',
  'Some Women Avoid Breast Cancer Screening After False-Positive Mammogram Results'),
 ('Breast Cancer Research Articles - NCI.docx', 'Posted: October 4, 2024'),
 ('Breast Cancer Research Articles - NCI.docx',
  'Some women who receive a false-positive result on a mammogram may not come back for routine breast cancer screening in the future, a new study nds. Better doctor–patient communication about the screening process is needed, several researchers said.'),
 ('Breast Cancer Research Articles - NCI

# usage of functions



In [9]:
# Example usage:
documents_directory = "all_documents/"
combined_text = process_documents(documents_directory)
faiss_index = prepare_from_combined_text(combined_text)

✅ Processed Breast Cancer Research Articles - NCI.docx (168 elements)
✅ Processed cancers-15-00321.docx (364 elements)
✅ Processed ijo-57-06-1245.docx (292 elements)
Creating FAISS index with 876 chunks using PubMedBERT...


# ask and query

In [10]:
from IPython.display import Markdown, display
import google.generativeai as genai
import json

def ask(faiss_index, query, k):
    """
    Retrieve relevant documents from FAISS, generate a response with citations, and return data in two parts:
    1. Display answer and sources in Markdown format
    2. Return JSON containing query, retrieved FAISS content, and LLM-generated answer
    
    Args:
        faiss_index (FAISS): FAISS vector index
        query (str): User's question
        k (int): Number of top results to retrieve
    
    Returns:
        dict: JSON containing query, retrieved content from FAISS, and LLM-generated answer
    """
    # Retrieve relevant documents with scores
    docs_faiss = faiss_index.similarity_search_with_score(query, k=k)
    
    # Prepare context with citations and FAISS retrieval data
    context_parts = []
    source_mapping = {}
    retrieved_content = []
    
    for i, (doc, score) in enumerate(docs_faiss, 1):
        source = doc.metadata.get('source_filename', f"Document {i}")  # Get filename from metadata
        chunk_id = doc.metadata.get('chunk_id', 'Unknown')  # Retrieve chunk ID
        context_parts.append(f"[[{i}]] {doc.page_content}")
        source_mapping[i] = {"source": source, "chunk_id": chunk_id, "score": float(score)}  # Ensure score is JSON serializable
        retrieved_content.append({"source_filename": source, "chunk_id": chunk_id, "page_content": doc.page_content, "similarity_score": float(score)})
    
    context_text = "\n\n".join(context_parts)
    
    # Enhanced prompt with citation instructions
    prompt = f"""
    Context information is below. Each section is marked with [[NUMBER]] citations.
    ---------------------
    {context_text}
    ---------------------
    Given the context, answer this question: {query}
    
    Requirements:
    1. If the information isn't in the context, say "I don't have that information"
    2. For any facts used, include [[NUMBER]] citations pointing to which document they came from
    3. Include a "Sources" section at the end listing all cited documents with chunk IDs
    4. Keep the answer concise but accurate
    """
    
    # Generate response
    genai.configure(api_key="AIzaSyAYy9e5qAvCyytku6ardpMywXhRNjkfLRs")  # Replace with actual API key
    model = genai.GenerativeModel("gemini-1.5-flash")
    response = model.generate_content(prompt)
    
    # Extract generated text
    generated_text = response.candidates[0].content.parts[0].text
    
    # Add detailed sources section
    sources_section = "\n\n## Sources\n"
    cited_sources = []
    for num, metadata in source_mapping.items():
        if f"[[{num}]]" in generated_text:
            source_entry = {
                "citation": f"[[{num}]]",
                "source_filename": metadata["source"],
                "chunk_id": metadata["chunk_id"],
                "similarity_score": float(metadata["score"])
            }
            cited_sources.append(source_entry)
            sources_section += f"- [[{num}]] {metadata['source']} (Chunk ID: {metadata['chunk_id']}, Similarity Score: {metadata['score']:.2f})\n"
    
    full_response = f"Answer : {generated_text}\n{sources_section}"
    
    # Part 1: Display response in Markdown format
    display(Markdown(full_response))
    
    # Part 2: Return JSON output
    output_json = {
        "query": query,
        "retrieved_content": retrieved_content,  # Raw FAISS data
        "generated_response": generated_text
    }
    
    return json.dumps(output_json, indent=4)

In [61]:
# Define the query
import google.generativeai as genai
query = """
what percentage of all breast cancers does tnbc account for?



"""

In [62]:
final_anwer=ask(faiss_index,query,k=5)

Answer : I don't have that information.  While the provided text mentions Triple-Negative Breast Cancer (TNBC) [[4]], it does not state what percentage of all breast cancers it represents.



## Sources
- [[4]] cancers-15-00321.docx (Chunk ID: 0, Similarity Score: 1.04)


# Evaluation model

In [19]:
import os
import pytest
import asyncio
from getpass import getpass
from ragas import SingleTurnSample
from langchain_google_genai import ChatGoogleGenerativeAI
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextPrecisionWithoutReference
from pydantic import SecretStr
from ragas.metrics import LLMContextRecall
from ragas.metrics import ResponseRelevancy, FactualCorrectness, LLMContextRecall, LLMContextPrecisionWithoutReference
import time
from langchain.embeddings import HuggingFaceEmbeddings
import sentence_transformers
from ragas import SingleTurnSample, EvaluationDataset, evaluate
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_core.runnables import RunnableConfig



In [15]:
mini_all_samples=[SingleTurnSample(user_input='What defines triple negative breast cancer (TNBC) at the immunohistochemical level?', retrieved_contexts=['Triple‑negative breast cancer (TNBC) is characterized as having ≤1% cellular expression of ER and PR as determined by immunohistochemistry (IHC), and having HER2 expres-sion of 0 to 1+ by IHC, or 2+ by IHC and fluorescence in situ hybridization (FISH) negative (i.e. not an amplified gene copy number), according to American Society of Clinical Oncology/College of American Pathologists (ASCO/CAP) guidelines (4,5). TNBCs are comprised of at least four distinct transcriptional subtypes: Two basal subtypes, BL1 and BL2; a mesenchymal subtype M, which is devoid of immune cells; and a luminal androgen receptor (AR) subtype LAR (1,2). TNBC is also subdivided into 6 different subgroups based on molecular heterogeneity: Basal‑like; mesenchymal‑like; mesenchymal stem‑like; luminal AR expression; immunomodulatory; and unstable type (6). TNBC represents approximately 15‑20% of all newly diagnosed breast cancers and is generally a more aggressive disease with a poorer prognosis and higher grade than', 'subtypes of triple negative breast cancer: Understanding the', 'Triple-negative breast cancer (TNBC) accounts for approximately 15–20% of breast malignancies and is the only subtype of breast cancer that lacks targeted treatment [1]. Compared with other subtypes, TNBC is more aggressive, and most patients develop recurrence and metastasis within 3 years, with poor prognosis [2]. Anthracycline- and taxane-based chemotherapy remains the mainstay of treatment for early-stage patients, but resistance has emerged [3]; for patients with recurrence or metastasis, there are even fewer treatment options. There is an urgent need for novel and more effective treatments.'], reference_contexts=None, response='Triple-negative breast cancer (TNBC) is defined immunohistochemically by having ≤1% cellular expression of estrogen receptor (ER) and progesterone receptor (PR), and HER2 expression of 0 to 1+ by IHC, or 2+ by IHC and fluorescence in situ hybridization (FISH) negative (meaning not an amplified gene copy number), according to ASCO/CAP guidelines [[1]].\nSources:\n[[1]]: Paragraph 1\n', multi_responses=None, reference='Lack of expression (< 1%) of ER, PgR, and HER2.', rubrics=None),
 SingleTurnSample(user_input='What percentage of all breast cancers does TNBC account for?', retrieved_contexts=['The test, which helps guide treatment decisions, was not as good at predicting the risk of death from breast cancer for Black patients as for White patients, a new study has found. The ndings highlight the need for greater racial diversity in research studies.', 'TAILORx trial finds most women with early breast cancer do not benefit from chemotherapy', 'Findings from the TAILORx clinical trial show chemotherapy does not bene t most women with early breast cancer. The new data, released at the 2018 ASCO annual meeting, will help inform treatment decisions for many women with early-stage breast cancer.'], reference_contexts=None, response="I don't have that information.  The provided text discusses the TAILORx trial and disparities in breast cancer risk prediction between racial groups [[1]], but it does not provide the percentage of all breast cancers represented by triple-negative breast cancer (TNBC).\nSources:\n[[1]]:  The test, which helps guide treatment decisions, was not as good at predicting the risk of death from breast cancer for Black patients as for White patients, a new study has found. The ndings highlight the need for greater racial diversity in research studies.\n[[2]]: TAILORx trial finds most women with early breast cancer do not benefit from chemotherapy\n[[3]]: Findings from the TAILORx clinical trial show chemotherapy does not bene t most women with early-stage breast cancer. The new data, released at the 2018 ASCO annual meeting, will help inform treatment decisions for many women with early-stage breast cancer.\n", multi_responses=None, reference='Approximately 10-20%.', rubrics=None),
 SingleTurnSample(user_input='What patient population is most commonly affected by TNBC?', retrieved_contexts=['After a Breast Cancer Diagnosis, Men May Be More Likely to Die than Women', 'TAILORx trial finds most women with early breast cancer do not benefit from chemotherapy', 'Fewer Women with Ovarian, Breast Cancer Undergo Genetic Testing than Expected'], reference_contexts=None, response="I don't have that information.  The provided text focuses on breast cancer statistics and genetic testing rates, but doesn't specify the patient population most commonly affected by TNBC (Triple-Negative Breast Cancer).\n", multi_responses=None, reference='Young, premenopausal women.', rubrics=None),
 SingleTurnSample(user_input='What kind of genetic alterations are associated with TNBC?', retrieved_contexts=['Breast cancer is characterized by heterogeneity at the molecular and clinical levels. Several biomarkers including', 'Tumor mutational burden (TMB) is a measurement of the number of nonsynonymous somatic mutations in the genome of tumor cells [121]. When TMB > 10 mutations/Mb, neoantigen production becomes common to tumor cells and can be recognized by TILs [121]. High TMB has been associated with efficacy benefits for ICIs in various tumors [122,123]. Despite being the highest TMB subtype of breast cancer, TNBC still has a low mutational load compared to other tumors such as melanoma. One study showed that the median TMB in breast cancer was 2.63 mut/Mb and only 5% of patients had high TMB (>10 mut/Mb), with metastatic tumors having higher TMB. Of these, the median mutational burden in TNBC was 1.8 mut/Mb [124]. Data from 149 TNBC patients in the GeparNuevo trial showed a median TMB of 1.52 mut/Mb, and continuous TMB independently predicted pCR [125]. Data from 253 patients in the KEYNOTE-119 trial showed a positive correlation between TMB and clinical response to pembrolizumab, but not to', 'Many women diagnosed with ovarian and breast cancer are not undergoing tests for inherited genetic mutations that can provide important information to help guide decisions about treatment and longer-term cancer screening, a new study has found.'], reference_contexts=None, response='The provided text focuses on tumor mutational burden (TMB) in TNBC [[2]], noting that even though TNBC has the highest TMB among breast cancer subtypes, its mutational load is still low compared to other cancers like melanoma [[2]].  Specific details on other genetic alterations associated with TNBC are not provided.  The text also mentions that many women with breast cancer are not tested for inherited genetic mutations [[3]], but does not specify which mutations these are.\n\nSources:\n[[2]]: Section discussing Tumor Mutational Burden in TNBC\n[[3]]: Section discussing lack of testing for inherited genetic mutations in breast cancer patients\n', multi_responses=None, reference='BRCA 1/2 mutations.', rubrics=None),
 SingleTurnSample(user_input='How would you describe the biological behavior of TNBC?', retrieved_contexts=['subtypes of triple negative breast cancer: Understanding the', 'In this review, we discuss current TNBC treatments and key examples of improved clinical benefit, as well as new therapeutic strategies with which to treat the disease.', 'Cancers 2023, 15, x 2 of 29 Cancers 2023, 15, 321 2 of 28 Some studies have found that TNBC has higher PD-L1 expression and tumor-infil- trating lymphocytes (TILs) in comparison with other subtyp [7,8], making it the most Some studies have found that TNBC has higher PD-L1 ex ression and tumor-infiltrating likelyymphocytessubtype(TILs)tobenefitincomparisonfromimmunowitherapyother. subtypesHowever,[7PD,8],-(L)1makinginhibitorsthe benefitmostlikeonly'], reference_contexts=None, response='Based on the provided text, TNBC exhibits higher PD-L1 expression and tumor-infiltrating lymphocytes (TILs) compared to other breast cancer subtypes [[3]].  This makes it a potentially good candidate for immunotherapy [[3]].  Further biological details are not described in the provided text.\n\nSources:\n* [[3]]:  A section of text discussing TNBC and immunotherapy.\n', multi_responses=None, reference='Aggressive, higher grade, often lymph node involvement, inclination to metastasize.', rubrics=None)]

In [16]:
mini_all_samples

[SingleTurnSample(user_input='What defines triple negative breast cancer (TNBC) at the immunohistochemical level?', retrieved_contexts=['Triple‑negative breast cancer (TNBC) is characterized as having ≤1% cellular expression of ER and PR as determined by immunohistochemistry (IHC), and having HER2 expres-sion of 0 to 1+ by IHC, or 2+ by IHC and fluorescence in situ hybridization (FISH) negative (i.e. not an amplified gene copy number), according to American Society of Clinical Oncology/College of American Pathologists (ASCO/CAP) guidelines (4,5). TNBCs are comprised of at least four distinct transcriptional subtypes: Two basal subtypes, BL1 and BL2; a mesenchymal subtype M, which is devoid of immune cells; and a luminal androgen receptor (AR) subtype LAR (1,2). TNBC is also subdivided into 6 different subgroups based on molecular heterogeneity: Basal‑like; mesenchymal‑like; mesenchymal stem‑like; luminal AR expression; immunomodulatory; and unstable type (6). TNBC represents approximat

In [21]:
len(mini_all_samples)

5

In [32]:


# Set API tokens
os.environ["RAGAS_APP_TOKEN"] = "apt.49b2-ab9d532bcd45-e6f1-9ea8-f4708472-b3f55"
os.environ["GEMINI_API_KEY"] = 'AIzaSyAYy9e5qAvCyytku6ardpMywXhRNjkfLRs'
api_key = os.environ["GEMINI_API_KEY"]

llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash', temperature=0, api_key=api_key)
llm_instance = LangchainLLMWrapper(llm)

# Define metrics
metrics = [
    ResponseRelevancy(llm=llm_instance),
    FactualCorrectness(llm=llm_instance),
    LLMContextPrecisionWithoutReference(llm=llm_instance),
    LLMContextRecall(llm=llm_instance)
]

# Initialize embeddings
embeddings = LangchainEmbeddingsWrapper(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
)

# Create an empty list to store results
all_results = []

# Loop through each sample and evaluate individually
for i, sample in enumerate(mini_all_samples):
    print(f"Processing sample {i + 1}/{len(mini_all_samples)}...")

    # Create evaluation dataset for a single sample
    eval_dataset = EvaluationDataset([sample])

    try:
        # Evaluate the sample
        result = evaluate(dataset=eval_dataset, embeddings=embeddings, metrics=metrics, llm=llm_instance)
        all_results.append(result)  # Store result
        
        print(f"Completed {i + 1}/{len(mini_all_samples)} | Result: {result}")
    
    except Exception as e:
        print(f"Error processing sample {i + 1}: {e}")
    
    # Respect API limits with a delay
    time.sleep(25)  # Adjust delay based on API rate limits

# Upload all results at once
if all_results:
    print("Uploading all results at once...")
    EvaluationDataset(all_results).upload()
    print("All results uploaded successfully!")
else:
    print("No results to upload.")


Processing sample 1/5...


Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Completed 1/5 | Result: {'answer_relevancy': 0.9708, 'factual_correctness(mode=f1)': 0.6000, 'llm_context_precision_without_reference': 1.0000, 'context_recall': 1.0000}
Processing sample 2/5...


Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 30
}
].
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 29
}
].
Retrying langchain_google_

KeyboardInterrupt: 

Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 17
}
].
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 16
}
].
Retrying langchain_google_

In [37]:
import time

# Create an empty list to store results
all_results = []

# Define delay time for each iteration
DEFAULT_DELAY = 45 # Wait 16s before the next loop iteration
RATE_LIMIT_DELAY = 60  # If rate limited, wait 60s

# Loop through each sample and evaluate individually
for i, sample in enumerate(mini_all_samples):
    print(f"\nProcessing sample {i + 1}/{len(mini_all_samples)}...")

    eval_dataset = EvaluationDataset([sample])

    while True:
        try:
            # Evaluate the sample
            result = evaluate(dataset=eval_dataset, embeddings=embeddings, metrics=metrics, llm=llm_instance)
            all_results.append(result)  # Store result
            print(f"✅ Completed {i + 1}/{len(mini_all_samples)} | Result: {result}")

            break  # Exit retry loop on success

        except Exception as e:
            if "429" in str(e):  # Check if the error is rate limit exceeded
                print(f"⚠️ Rate limit exceeded! Retrying in {RATE_LIMIT_DELAY} seconds...")
                time.sleep(RATE_LIMIT_DELAY)  # Wait before retrying
            else:
                print(f"❌ Error processing sample {i + 1}: {e}")
                break  # Exit loop for non-rate-limit errors

    # Ensure the next loop iteration waits 16 seconds
    print(f"⏳ Waiting {DEFAULT_DELAY} seconds before processing the next sample...\n")
    time.sleep(DEFAULT_DELAY)

# Upload all results at once
if all_results:
    print("🚀 Uploading all results at once...")
    final_dataset = EvaluationDataset(all_results)
    # final_dataset.upload()  # Uncomment if the upload method exists
    print("✅ All results uploaded successfully!")
else:
    print("⚠️ No results to upload.")



Processing sample 1/5...


Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Completed 1/5 | Result: {'answer_relevancy': 0.9708, 'factual_correctness(mode=f1)': 0.7300, 'llm_context_precision_without_reference': 1.0000, 'context_recall': 1.0000}
⏳ Waiting 45 seconds before processing the next sample...


Processing sample 2/5...


Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Completed 2/5 | Result: {'answer_relevancy': 0.0000, 'factual_correctness(mode=f1)': 0.0000, 'llm_context_precision_without_reference': 0.0000, 'context_recall': 0.0000}
⏳ Waiting 45 seconds before processing the next sample...


Processing sample 3/5...


Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Completed 3/5 | Result: {'answer_relevancy': 0.0000, 'factual_correctness(mode=f1)': 0.4000, 'llm_context_precision_without_reference': 0.0000, 'context_recall': 0.0000}
⏳ Waiting 45 seconds before processing the next sample...


Processing sample 4/5...


Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Completed 4/5 | Result: {'answer_relevancy': 0.7553, 'factual_correctness(mode=f1)': 0.0000, 'llm_context_precision_without_reference': 0.5000, 'context_recall': 0.0000}
⏳ Waiting 45 seconds before processing the next sample...


Processing sample 5/5...


Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Exception raised in Job[2]: OutputParserException(Invalid json output: ```json
{
    "reason": "The context provided is insufficient to answer the question. The context only mentions \"subtypes of triple negative breast cancer: Understanding the\", which is not enough to describe the biological behavior of TNBC. The answer relies on external sources (\[[3]]) not present in the context.",
    "verdict": 0
}
```
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE )


✅ Completed 5/5 | Result: {'answer_relevancy': 0.6346, 'factual_correctness(mode=f1)': 0.0000, 'llm_context_precision_without_reference': nan, 'context_recall': 0.0000}
⏳ Waiting 45 seconds before processing the next sample...

🚀 Uploading all results at once...
✅ All results uploaded successfully!


In [39]:
final_dataset.upload()

AttributeError: 'EvaluationDataset' object has no attribute 'upload'

In [36]:
mini_all_samples

[SingleTurnSample(user_input='What defines triple negative breast cancer (TNBC) at the immunohistochemical level?', retrieved_contexts=['Triple‑negative breast cancer (TNBC) is characterized as having ≤1% cellular expression of ER and PR as determined by immunohistochemistry (IHC), and having HER2 expres-sion of 0 to 1+ by IHC, or 2+ by IHC and fluorescence in situ hybridization (FISH) negative (i.e. not an amplified gene copy number), according to American Society of Clinical Oncology/College of American Pathologists (ASCO/CAP) guidelines (4,5). TNBCs are comprised of at least four distinct transcriptional subtypes: Two basal subtypes, BL1 and BL2; a mesenchymal subtype M, which is devoid of immune cells; and a luminal androgen receptor (AR) subtype LAR (1,2). TNBC is also subdivided into 6 different subgroups based on molecular heterogeneity: Basal‑like; mesenchymal‑like; mesenchymal stem‑like; luminal AR expression; immunomodulatory; and unstable type (6). TNBC represents approximat

In [28]:
all_results

[{'answer_relevancy': 0.9708, 'factual_correctness(mode=f1)': 0.7300, 'llm_context_precision_without_reference': 1.0000, 'context_recall': 1.0000},
 {'answer_relevancy': 0.0000, 'factual_correctness(mode=f1)': 0.0000, 'llm_context_precision_without_reference': 0.0000, 'context_recall': 0.0000},
 {'answer_relevancy': 0.0000, 'factual_correctness(mode=f1)': 0.4000, 'llm_context_precision_without_reference': 0.0000, 'context_recall': 0.0000},
 {'answer_relevancy': 0.7553, 'factual_correctness(mode=f1)': 0.4400, 'llm_context_precision_without_reference': 0.5000, 'context_recall': 0.0000},
 {'answer_relevancy': 0.6346, 'factual_correctness(mode=f1)': 0.0000, 'llm_context_precision_without_reference': nan, 'context_recall': 0.0000}]

In [29]:
eval_dataset = EvaluationDataset(all_results)

In [31]:
eval_dataset.upload()

AttributeError: 'EvaluationDataset' object has no attribute 'upload'

# testing type

In [41]:
import json

def load_test_data(filename):
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)


In [42]:
import json

def get_llm_response(test_data, faiss_index=faiss_index, k=3):
    """
    Calls the `ask` function to retrieve an LLM-generated response using FAISS.
    
    Args:
        test_data (dict): Dictionary containing the query under the key "question".
        faiss_index: FAISS index for document retrieval.
        k (int): Number of top results to retrieve (default=3).
    
    Returns:
        dict: Parsed JSON response from `ask` function.
    """
    # Extract the question from the input dictionary
    query = test_data["question"]
    
    # Call the ask function
    response_json = ask(faiss_index, query, k)
    
    # Convert JSON string to Python dictionary
    response_dict = json.loads(response_json)
    
    return response_dict  # Return the dictionary instead of a JSON string

# Example usage:
# test_data = {"question": "Where is the Eiffel Tower located?"}
# response = get_llm_response(test_data, faiss_index)
# print(response)


In [43]:
import os
from langchain.embeddings import HuggingFaceEmbeddings
import sentence_transformers
from ragas import SingleTurnSample, EvaluationDataset, evaluate
from ragas.embeddings import LangchainEmbeddingsWrapper
 # Ensure your LLM wrapper is correctly imported
from langchain_core.runnables import RunnableConfig
from ragas.metrics import ResponseRelevancy, FactualCorrectness
from ragas.metrics import LLMContextRecall
from ragas.metrics import LLMContextPrecisionWithoutReference

# Set API token if required
os.environ["RAGAS_APP_TOKEN"] = "apt.49b2-ab9d532bcd45-e6f1-9ea8-f4708472-b3f55"
os.environ["GEMINI_API_KEY"] = 'AIzaSyAYy9e5qAvCyytku6ardpMywXhRNjkfLRs'
api_key = os.environ["GEMINI_API_KEY"]

# Load test data
test_data = load_test_data("output.json")[1] # Select first test case for now

llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash', temperature=0, api_key=api_key)
llm_instance=LangchainLLMWrapper(llm)


# Get response from local RAG+LLM pipeline
responseDict = get_llm_response(test_data)
print(responseDict)
# Convert response into `SingleTurnSample` format
sample = SingleTurnSample(
    user_input=test_data["question"],
    response=responseDict["generated_response"],
    retrieved_contexts=[doc["page_content"] for doc in responseDict.get("retrieved_content", [])],
    reference=test_data["reference"]
)

# Create evaluation dataset
eval_dataset = EvaluationDataset([sample])

# Use Hugging Face Embeddings
embeddings = LangchainEmbeddingsWrapper(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
)


metrics = [ResponseRelevancy(llm=llm_instance), FactualCorrectness(llm=llm_instance),LLMContextPrecisionWithoutReference(llm=llm_instance),
    LLMContextRecall(llm=llm_instance)]

run_config = RunnableConfig()

results = evaluate(dataset=eval_dataset, embeddings=embeddings, metrics=metrics,llm=llm_instance)


# Display results
print("Evaluation Results:", results)
print("Answer Relevancy Score:", results["answer_relevancy"])

# Upload results (optional)
results.upload()


Answer : I don't have that information.  The provided text focuses on the TAILORx trial's findings regarding chemotherapy benefits for early-stage breast cancer [[2, 3]] and disparities in a breast cancer risk prediction test [[1]], but it doesn't mention the percentage of breast cancers represented by TNBC (triple-negative breast cancer).


Sources:
* [[1]]: Paragraph describing disparities in breast cancer risk prediction test results.
* [[2]]: Summary of TAILORx trial findings regarding chemotherapy.
* [[3]]: Detailed summary of TAILORx trial findings and their implications for treatment decisions.



## Sources
- [[1]] Breast Cancer Research Articles - NCI.docx (Chunk ID: 0, Similarity Score: 0.94)
- [[2]] Breast Cancer Research Articles - NCI.docx (Chunk ID: 0, Similarity Score: 0.98)
- [[3]] Breast Cancer Research Articles - NCI.docx (Chunk ID: 0, Similarity Score: 1.04)


{'query': 'What percentage of all breast cancers does TNBC account for?', 'retrieved_content': [{'source_filename': 'Breast Cancer Research Articles - NCI.docx', 'chunk_id': 0, 'page_content': 'The test, which helps guide treatment decisions, was not as good at predicting the risk of death from breast cancer for Black patients as for White patients, a new study has found. The ndings highlight the need for greater racial diversity in research studies.', 'similarity_score': 0.9357395172119141}, {'source_filename': 'Breast Cancer Research Articles - NCI.docx', 'chunk_id': 0, 'page_content': 'TAILORx trial finds most women with early breast cancer do not benefit from chemotherapy', 'similarity_score': 0.9819375276565552}, {'source_filename': 'Breast Cancer Research Articles - NCI.docx', 'chunk_id': 0, 'page_content': 'Findings from the TAILORx clinical trial show chemotherapy does not bene t most women with early breast cancer. The new data, released at the 2018 ASCO annual meeting, will h

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluation Results: {'answer_relevancy': 0.0000, 'factual_correctness(mode=f1)': 0.4000, 'llm_context_precision_without_reference': 0.0000, 'context_recall': 0.0000}
Answer Relevancy Score: [np.float64(0.0)]
Evaluation results uploaded! View at https://app.ragas.io/dashboard/alignment/evaluation/a800b0dc-aa56-4d20-adab-db51ed72ecf9


'https://app.ragas.io/dashboard/alignment/evaluation/a800b0dc-aa56-4d20-adab-db51ed72ecf9'

In [51]:
all_results

[{'answer_relevancy': 0.9708, 'factual_correctness(mode=f1)': 0.7300, 'llm_context_precision_without_reference': 1.0000, 'context_recall': 1.0000},
 {'answer_relevancy': 0.0000, 'factual_correctness(mode=f1)': 0.0000, 'llm_context_precision_without_reference': 0.0000, 'context_recall': 0.0000},
 {'answer_relevancy': 0.0000, 'factual_correctness(mode=f1)': 0.4000, 'llm_context_precision_without_reference': 0.0000, 'context_recall': 0.0000},
 {'answer_relevancy': 0.7553, 'factual_correctness(mode=f1)': 0.0000, 'llm_context_precision_without_reference': 0.5000, 'context_recall': 0.0000},
 {'answer_relevancy': 0.6346, 'factual_correctness(mode=f1)': 0.0000, 'llm_context_precision_without_reference': nan, 'context_recall': 0.0000}]

In [52]:
# Create a structured dataset with all evaluation results
final_dataset = {
    "evaluation_results": all_results  # List of dictionaries
}

print("✅ Final dataset prepared for upload!")


✅ Final dataset prepared for upload!


In [54]:
final_dataset.upload()

AttributeError: 'dict' object has no attribute 'upload'

In [56]:
import numpy as np
import pandas as pd

# Sample evaluation results
results = [
    {'answer_relevancy': 0.9708, 'factual_correctness(mode=f1)': 0.7300, 'llm_context_precision_without_reference': 1.0000, 'context_recall': 1.0000},
    {'answer_relevancy': 0.0000, 'factual_correctness(mode=f1)': 0.0000, 'llm_context_precision_without_reference': 0.0000, 'context_recall': 0.0000},
    {'answer_relevancy': 0.0000, 'factual_correctness(mode=f1)': 0.4000, 'llm_context_precision_without_reference': 0.0000, 'context_recall': 0.0000},
    {'answer_relevancy': 0.7553, 'factual_correctness(mode=f1)': 0.0000, 'llm_context_precision_without_reference': 0.5000, 'context_recall': 0.0000},
    {'answer_relevancy': 0.6346, 'factual_correctness(mode=f1)': 0.0000, 'llm_context_precision_without_reference': np.nan, 'context_recall': 0.0000}
]

# Convert to DataFrame
df = pd.DataFrame(results)

# Compute averages, ignoring NaN values
final_scores = df.mean(skipna=True)

# Display final accuracy
print(final_scores)


answer_relevancy                           0.47214
factual_correctness(mode=f1)               0.22600
llm_context_precision_without_reference    0.37500
context_recall                             0.20000
dtype: float64


In [57]:
pip install pinecone

Collecting pineconeNote: you may need to restart the kernel to use updated packages.

  Downloading pinecone-6.0.2-py3-none-any.whl.metadata (9.0 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone-6.0.2-py3-none-any.whl (421 kB)
Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone
Successfully installed pinecone-6.0.2 pinecone-plugin-interface-0.0.7
