<a href="https://colab.research.google.com/github/ajaysingh-codes/farmworker-health-rag/blob/main/farmworker_health_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Farmworker Health Research RAG System

**A smart search system for scientific literature on farmworker health, chemical exposures, and occupational stressors**

Built with BM25 + Semantic Search | Interactive comparison interface | Optimized for health research papers

---



## 📦 Installing Required Libraries
This cell installs all the necessary Python packages for our RAG system:
- `sentence-transformers`: For creating semantic embeddings of text
- `bm25s`: For keyword-based search (BM25 algorithm)
- `pypdf2`: For extracting text from PDF files
- `pandas`: For data manipulation
- `numpy`: For numerical operations
- `joblib`: For saving/loading embeddings
- `ipywidgets`: For creating the interactive interface

In [None]:
# Install the notebook's dependencies; -q keeps pip's output quiet.
# NOTE(review): `sklearn` is imported later in this notebook (for cosine similarity) but is
# not listed here - presumably it is preinstalled or pulled in transitively; confirm.
!pip install sentence-transformers bm25s pypdf2 pandas numpy joblib ipywidgets -q
# Printed unconditionally: pip's exit status is not checked before this message.
print("Libraries installed successfully")

## Importing Libraries and Creating Workspace
This cell:
1. Imports all the libraries we'll use throughout the project
2. Creates a dedicated folder `/content/papers` for your PDF files
3. Sets up the basic environment for our RAG system

In [None]:
# Import all necessary libraries
import os
import pandas as pd
import numpy as np
import bm25s
import joblib
from sentence_transformers import SentenceTransformer
from IPython.display import display, Markdown
import ipywidgets as widgets
from PyPDF2 import PdfReader
from datetime import datetime

# Create a folder for your PDFs
pdf_folder = "/content/papers"

# Report whether the workspace was already there, creating it when it is not
if os.path.exists(pdf_folder):
    print(f"📁 Folder already exists: {pdf_folder}")
else:
    os.makedirs(pdf_folder)
    print(f"✅ Created folder: {pdf_folder}")

## Upload Your Research Papers
Upload your 5 PDF papers about farmworker health:
- Click 'Choose Files' to select your PDFs
- Papers will be moved to the `papers` folder
- You'll see confirmation for each uploaded file

In [None]:
from google.colab import files

print("Click 'Choose Files' below to upload your 5 PDFs:\n")

# files.upload() returns a mapping keyed by uploaded file name
uploaded = files.upload()

# Move uploaded files to the papers folder
# (assumes each upload was saved under its own name in the current directory - TODO confirm)
for filename in uploaded:
    destination = os.path.join(pdf_folder, filename)
    os.rename(filename, destination)
    print(f"✅ Uploaded: {filename}")

# Verify and list all PDFs in the folder.
# - The extension check is case-insensitive so files such as "paper.PDF" are not silently skipped.
# - os.listdir returns entries in arbitrary order; sorting keeps the processing order
#   (and therefore chunk positions) reproducible between runs.
pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf'))
print(f"📊 Total PDFs in folder: {len(pdf_files)}")
for i, pdf in enumerate(pdf_files, 1):
    print(f"  {i}. {pdf}")

## Chunking Function with Overlap
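Using 150-word chunks with 20% overlap: every chunk after the first is additionally prefixed with the last 30 words of the preceding text, so context carries across chunk boundaries.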

In [None]:
from typing import List
import re

def get_chunks_fixed_size_with_overlap(text: str, chunk_size: int, overlap_fraction: float) -> List[str]:
    """
    Splits text into word-based chunks, each one prefixed with the tail of the text before it.

    The text is split on whitespace and walked in steps of `chunk_size` words.
    Every chunk after the first also includes the `int(chunk_size * overlap_fraction)`
    words that precede its starting position, so those chunks hold up to
    `chunk_size + overlap` words rather than exactly `chunk_size`.

    Parameters:
    - text (str): The text to be split into chunks.
    - chunk_size (int): Number of new words each chunk advances by; must be positive.
    - overlap_fraction (float): Fraction of `chunk_size` repeated from the preceding
      words (0.2 = 20% overlap); must not be negative.

    Returns:
    - List[str]: The chunks in document order, words joined by single spaces.
      Empty when `text` contains no words.

    Raises:
    - ValueError: If `chunk_size` is not positive or `overlap_fraction` is negative.
    """
    # A zero step makes range() fail with an unrelated-looking message and a
    # negative one silently yields no chunks, so reject both explicitly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # A negative overlap would move each chunk's start forward and drop words.
    if overlap_fraction < 0:
        raise ValueError(f"overlap_fraction must not be negative, got {overlap_fraction}")

    text_words = text.split()

    # Number of preceding words repeated at the start of each later chunk
    overlap_int = int(chunk_size * overlap_fraction)

    return [
        # max(..., 0) keeps the first chunk from reaching before the start of the text
        " ".join(text_words[max(start - overlap_int, 0): start + chunk_size])
        for start in range(0, len(text_words), chunk_size)
    ]

## 🔍 Extract and Process PDFs
Extract text and create chunks from each paper

In [None]:
def extract_and_chunk_pdf(pdf_path, chunk_size=150, overlap=0.2):
    """
    Extract the text of a PDF and split it into overlapping word chunks.

    Parameters:
    - pdf_path: Path of the PDF file to read.
    - chunk_size (int): Chunk size forwarded to get_chunks_fixed_size_with_overlap.
    - overlap (float): Overlap fraction forwarded to get_chunks_fixed_size_with_overlap.

    Returns:
    - tuple: (chunks, number_of_pages, success). If anything goes wrong the error
      is printed and ([], 0, False) is returned instead of raising.
    """
    try:
        reader = PdfReader(pdf_path)

        # Keep only pages for which extract_text() produced something
        page_texts = [text for text in (page.extract_text() for page in reader.pages) if text]

        # One pass is enough: \s+ already covers newlines, so every whitespace run
        # (including page breaks introduced by the join) collapses to a single space.
        full_text = re.sub(r'\s+', ' ', " ".join(page_texts))

        chunks = get_chunks_fixed_size_with_overlap(full_text, chunk_size, overlap)

        return chunks, len(reader.pages), True

    except Exception as e:
        # Deliberately broad: this is a best-effort, per-file boundary, and the
        # caller decides what to do based on the returned success flag.
        print(f"❌ Error processing {pdf_path}: {e}")
        return [], 0, False

# Process all PDFs and create dataset
# One record per chunk: the chunk text, the paper it came from, and a per-paper id
PAPERS_DATA = []

for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)

    # Strip only the extension; str.replace('.pdf', '') would also remove any
    # ".pdf" that happens to appear inside the file name.
    paper_name = os.path.splitext(pdf_file)[0]

    # Extract and chunk (the page count is returned but not needed here)
    chunks, _num_pages, success = extract_and_chunk_pdf(pdf_path)

    if not success:
        # Name the file so the user can tell which upload needs attention
        print(f"  ❌ Failed to process {pdf_file}")
        continue

    PAPERS_DATA.extend(
        {
            'content': chunk,  # Main text for search
            'source': paper_name,  # Which paper it's from
            'id': f"{paper_name}_{i}",  # Position of the chunk within the paper
        }
        for i, chunk in enumerate(chunks)
    )

print(f"\n✅ Total chunks created: {len(PAPERS_DATA)}")

## Step 3: Implement Retrieval functions
Build BM25 (keyword) and Semantic Search capabilities.

In [None]:
# Create corpus for searching (combining all text for each chunk)
# Chunk texts, in the same order as PAPERS_DATA
corpus = [entry['content'] for entry in PAPERS_DATA]

# Distinct paper names that contributed at least one chunk
distinct_sources = {entry['source'] for entry in PAPERS_DATA}

print(f"Corpus ready with {len(corpus)} chunks")
print(f"From {len(distinct_sources)} papers")

### Step 3.1: BM25 (Keyword) Search


In [None]:
# Tokenize every chunk of the corpus up front
TOKENIZED_DATA = bm25s.tokenize(corpus)

# Build the retriever around the raw corpus, then index the tokenized chunks
BM25_RETRIEVER = bm25s.BM25(corpus=corpus)
BM25_RETRIEVER.index(TOKENIZED_DATA)

In [None]:
def bm25_retrieve(query: str, top_k: int = 5):
    """
    BM25 retrieval - keyword-based search.

    Parameters:
    - query (str): The search text.
    - top_k (int): Maximum number of results to return.

    Returns:
    - list[int]: Positions in `corpus` of the matching chunks, in the order the
      retriever returned them. Empty when `top_k` is not positive or the corpus
      is empty.
    """
    # Never ask for more results than there are documents
    # (bm25s is expected to reject k larger than the index size - see its retrieve() docs).
    k = min(top_k, len(corpus))
    if k <= 0:
        return []

    tokenized_query = bm25s.tokenize(query)

    # Hand retrieve() a "corpus" of positions so it returns indices directly.
    # Mapping returned chunk text back with corpus.index() costs a linear scan per
    # hit and resolves duplicated chunks to the first occurrence only.
    positions = list(range(len(corpus)))
    results, _scores = BM25_RETRIEVER.retrieve(tokenized_query, corpus=positions, k=k)

    # results has one row per query; there is a single query here
    return [int(position) for position in results[0]]

# Test BM25
test_query = "pesticide exposure health effects"
bm25_results = bm25_retrieve(test_query, top_k=3)

# Show where each of the first three hits came from, with a short preview
print(f"Query: {test_query}")
for hit in (PAPERS_DATA[i] for i in bm25_results[:3]):
    print(f"\n Source: {hit['source']}")
    print(f"Preview: {hit['content'][:200]}...")

### Step 3.2: Semantic Search with Embeddings
Create embeddings for semantic understanding

In [None]:
from sentence_transformers import SentenceTransformer

# Initialize the embedding model
# NOTE(review): this was described as "a model suitable for scientific/health content",
# but all-MiniLM-L6-v2 looks like a general-purpose sentence-embedding model -
# confirm whether a domain-specific model was intended.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate embeddings for all chunks
# The search code later uses row positions of EMBEDDINGS as indices into PAPERS_DATA,
# so this assumes row i corresponds to corpus[i] - TODO confirm.
EMBEDDINGS = model.encode(corpus, show_progress_bar=True)

In [None]:
# Semantic Search Function
def semantic_search_retrieve(query, top_k=5):
    """
    Semantic search using embeddings.

    Parameters:
    - query: Text to embed and compare against EMBEDDINGS.
    - top_k (int): Maximum number of results to return.

    Returns:
    - list[int]: Row positions in EMBEDDINGS ordered from most to least similar.
      Empty when `top_k` is not positive.
    """
    # Guard first: with top_k == 0 the slice [-0:] below would select every chunk,
    # and a negative top_k would select an unrelated portion of the ranking.
    if top_k <= 0:
        return []

    # Imported locally because sklearn is not among the notebook's top-level imports
    from sklearn.metrics.pairwise import cosine_similarity

    query_embedding = model.encode(query)

    # cosine_similarity expects 2-D input; [0] takes the row for our single query
    similarities = cosine_similarity([query_embedding], EMBEDDINGS)[0]

    # argsort is ascending: keep the last top_k entries and reverse them
    top_k_indices = np.argsort(similarities)[-top_k:][::-1]

    return top_k_indices.tolist()

# Test semantic search
semantic_results = semantic_search_retrieve(test_query, top_k=3)

# Show where each of the first three hits came from, with a short preview
print(f"Query: {test_query}")
for hit in (PAPERS_DATA[i] for i in semantic_results[:3]):
    print(f"\n Source: {hit['source']}")
    print(f"Preview: {hit['content'][:200]}...")

### Step 3.3: Reciprocal Rank Fusion (RRF)
Combine the BM25 and semantic search results using RRF, which rewards chunks that rank highly in either retrieval method's list, and most of all those that rank highly in both.

In [None]:
# RRF Example
# RRF function (exactly from the example)
def reciprocal_rank_fusion(list1, list2, top_k=5, K=60):
    """
    Merge two ranked lists into one using Reciprocal Rank Fusion.

    Each item earns 1 / (rank + K) from every list it appears in (ranks start
    at 1); items are then ordered by their summed score, highest first, and the
    first `top_k` are returned. Items with equal scores keep the order in which
    they were first encountered (list1 before list2).
    """
    fused = {}

    for ranking in (list1, list2):
        for position, candidate in enumerate(ranking, start=1):
            fused[candidate] = fused.get(candidate, 0) + 1 / (position + K)

    # sorted() is stable, so ties stay in first-seen order
    by_score = sorted(fused.items(), key=lambda pair: pair[1], reverse=True)

    return [candidate for candidate, _ in by_score][:top_k]

# Test RRF
bm25_list = bm25_retrieve(test_query, top_k=5)
semantic_list = semantic_search_retrieve(test_query, top_k=5)
rrf_list = reciprocal_rank_fusion(bm25_list, semantic_list, top_k=5)

# Print the two input rankings followed by the fused one
print("🔀 Reciprocal Rank Fusion Results:")
for label, ranking in (
    ("BM25 returned", bm25_list),
    ("Semantic returned", semantic_list),
    ("RRF combined", rrf_list),
):
    print(f"{label}: {ranking}")

In [None]:
def query_papers(indices):
    """
    Look up the chunk records stored at the given positions of PAPERS_DATA.

    The records are returned in the same order as `indices`.
    """
    selected = []
    for position in indices:
        selected.append(PAPERS_DATA[position])
    return selected

# Preview the three best fused results
retrieved_papers = query_papers(rrf_list[:3])
for i, paper in enumerate(retrieved_papers, start=1):
    source, preview = paper['source'], paper['content'][:200]
    print(f"\n{i}. Source: {source}")
    print(f"  Content: {preview}...")

## Interactive Query Interface

In [None]:
# Import the generate function from utils (or define a simple one for testing)
def generate_with_single_input(prompt, **kwargs):
    """
    Stand-in for a real LLM call.

    Returns a dict whose 'content' entry echoes the first 500 characters of
    `prompt` inside a fixed template. Extra keyword arguments are accepted and
    ignored.
    """
    preview = prompt[:500]
    content = f"Based on the retrieved information:\n\n{preview}...\n\n[This would be the LLM's complete response]"
    return dict(content=content)

# Function to create the final prompt (from the example)
def generate_final_prompt(query, top_k, retrieve_function=None, use_rag=True):
    """
    Generate prompt with retrieved context.

    Parameters:
    - query: The user's question.
    - top_k: Number of chunks to request from the retriever.
    - retrieve_function: Callable used to find relevant chunks. A callable named
      'reciprocal_rank_fusion' is called with the semantic and BM25 result lists;
      any other callable is called as retrieve_function(query=..., top_k=...).
      Required when `use_rag` is True.
    - use_rag: When False, the query is returned unchanged and nothing is retrieved.

    Returns:
    - str: The prompt to send to the LLM.

    Raises:
    - ValueError: If `use_rag` is True but no `retrieve_function` was given.
    """
    if not use_rag:
        return query

    # Fail with a clear message instead of an AttributeError on None.__name__
    if retrieve_function is None:
        raise ValueError("retrieve_function is required when use_rag=True")

    # Handle RRF specially: it fuses two result lists instead of taking a query.
    # getattr is used because some callables (e.g. functools.partial objects)
    # have no __name__ attribute.
    if getattr(retrieve_function, '__name__', None) == 'reciprocal_rank_fusion':
        list1 = semantic_search_retrieve(query, top_k)
        list2 = bm25_retrieve(query, top_k)
        top_k_indices = retrieve_function(list1, list2, top_k)
    else:
        top_k_indices = retrieve_function(query=query, top_k=top_k)

    # Get the actual paper chunks
    relevant_chunks = query_papers(top_k_indices)

    # Format the context: one block per chunk, each excerpt cut to 300 characters
    context = "\n\n".join(
        f"Source: {chunk['source']}\n{chunk['content'][:300]}..."
        for chunk in relevant_chunks
    )

    # Create the prompt
    prompt = f"""Answer the user query based on the following scientific paper excerpts about farmworker health.

Context from papers:
{context}

Query: {query}

Please provide a comprehensive answer based on the above context:"""

    return prompt

# Main LLM call function
def llm_call(query, retrieve_function=None, top_k=5, use_rag=True):
    """
    Build the prompt for `query` (with or without retrieved context) and
    return the text content of the generated response.
    """
    final_prompt = generate_final_prompt(
        query,
        top_k=top_k,
        retrieve_function=retrieve_function,
        use_rag=use_rag,
    )
    return generate_with_single_input(final_prompt)['content']

## Create the Display Widget

In [None]:
import ipywidgets as widgets
from IPython.display import display, Markdown

def display_widget(llm_call_func, semantic_search_retrieve, bm25_retrieve, reciprocal_rank_fusion):
    """
    Create interactive widget for comparing retrieval methods.

    Renders a query box, a Top K slider and a button. Clicking the button calls
    `llm_call_func` once per strategy (semantic, BM25, RRF, and without RAG) and
    shows the four responses in a 2x2 grid.

    Parameters:
    - llm_call_func: Callable invoked as
      llm_call_func(query=..., use_rag=..., top_k=..., retrieve_function=...);
      its return value is rendered as Markdown.
    - semantic_search_retrieve, bm25_retrieve, reciprocal_rank_fusion: Retrieval
      callables passed through to `llm_call_func` as `retrieve_function`.
    """
    def on_button_click(b):
        query = query_input.value
        top_k = slider.value

        # Clear existing outputs
        for output in outputs:
            output.clear_output()
        status_output.clear_output()

        # Nothing to search for: say so instead of running four empty queries
        if not query.strip():
            with status_output:
                print("Please enter a query first.")
            return

        # Display "Generating..." message
        with status_output:
            print("Generating responses...")

        # (output area, use_rag, retriever) for each panel
        runs = [
            (output1, True, semantic_search_retrieve),
            (output2, True, bm25_retrieve),
            (output3, True, reciprocal_rank_fusion),
            (output4, False, None),
        ]

        # Disable the button while working so repeated clicks cannot queue up
        submit_button.disabled = True
        failures = 0
        try:
            for output, use_rag, retriever in runs:
                try:
                    response = llm_call_func(
                        query=query, use_rag=use_rag, top_k=top_k, retrieve_function=retriever
                    )
                except Exception as error:
                    # UI boundary: report the failure in its own panel and keep
                    # going, so one broken method does not hide the others or
                    # leave the status stuck on "Generating responses...".
                    failures += 1
                    with output:
                        print(f"❌ Error: {error}")
                    continue
                with output:
                    display(Markdown(response))
        finally:
            submit_button.disabled = False

        # Replace the "Generating..." message with the final status
        status_output.clear_output()
        with status_output:
            if failures:
                print(f"⚠️ Finished with {failures} failed method(s)")
            else:
                print("✅ Results ready!")

    # Create UI elements
    query_input = widgets.Text(
        description='',
        placeholder='Enter your query about farmworker health...',
        layout=widgets.Layout(width='100%')
    )

    slider = widgets.IntSlider(
        value=5,
        min=1,
        max=20,
        step=1,
        description='Top K:',
        style={'description_width': 'initial'}
    )

    # Output areas with styling
    output_style = {'border': '1px solid #ccc', 'width': '100%'}
    output1 = widgets.Output(layout=output_style)
    output2 = widgets.Output(layout=output_style)
    output3 = widgets.Output(layout=output_style)
    output4 = widgets.Output(layout=output_style)
    outputs = [output1, output2, output3, output4]
    status_output = widgets.Output()

    # The Button trait is `button_style`; `button_type` is not a Button trait
    submit_button = widgets.Button(
        description="Get Responses",
        button_style='primary',
        style={'button_color': '#4CAF50'}
    )
    submit_button.on_click(on_button_click)

    # Labels for each method
    label1 = widgets.Label(value="🧠 Semantic Search")
    label2 = widgets.Label(value="🔤 BM25 Search")
    label3 = widgets.Label(value="🔀 Reciprocal Rank Fusion")
    label4 = widgets.Label(value="❌ Without RAG")

    # Display the interface
    display(widgets.HTML("""
    <h2>🔍 Farmworker Health Research Query System</h2>
    <p>Compare different retrieval methods for your research questions</p>
    """))

    display(query_input, slider, submit_button, status_output)

    # Create layout with 2x2 grid
    vbox1 = widgets.VBox([label1, output1], layout={'width': '48%'})
    vbox2 = widgets.VBox([label2, output2], layout={'width': '48%'})
    vbox3 = widgets.VBox([label3, output3], layout={'width': '48%'})
    vbox4 = widgets.VBox([label4, output4], layout={'width': '48%'})

    hbox_outputs1 = widgets.HBox([vbox1, vbox2], layout={'justify_content': 'space-between'})
    hbox_outputs2 = widgets.HBox([vbox3, vbox4], layout={'justify_content': 'space-between'})

    # Style the outputs: fixed-height, scrollable panels
    for output in outputs:
        output.layout.margin = '5px'
        output.layout.height = '300px'
        output.layout.padding = '10px'
        output.layout.overflow = 'auto'

    # Display the grid
    display(hbox_outputs1)
    display(hbox_outputs2)

# Launch the widget!
print("🚀 Launching Interactive Query Interface...")
display_widget(
    llm_call_func=llm_call,
    semantic_search_retrieve=semantic_search_retrieve,
    bm25_retrieve=bm25_retrieve,
    reciprocal_rank_fusion=reciprocal_rank_fusion,
)