In [None]:

# 🔬 Scientific Paper RAG Chatbot - Google Colab Demo
# Features: Section-Level Semantic Querying + Rank-Based Re-weighting

#  STEP 1: Install Dependencies
# NOTE: the `!pip ...` line below is notebook shell-escape syntax, not plain
# Python, so this file only runs as a notebook cell (it is not importable).
# NOTE(review): scikit-learn is installed here but is not imported anywhere in
# the visible code -- confirm whether it is still needed.
print("üì¶ Installing dependencies...")
!pip install faiss-cpu scikit-learn PyPDF2 langchain sentence-transformers openai -q
print("‚úÖ Installation complete!")

#  STEP 2: Upload your Scientific Paper PDF
print("\nüìÑ Please upload your scientific paper PDF")
from google.colab import files
# The extraction step iterates `uploaded` and opens each key as a file path,
# so its keys are presumably the uploaded file names -- verify against the
# google.colab documentation.
uploaded = files.upload()

#  STEP 3: Extract text and detect sections
import PyPDF2
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Section patterns for scientific papers.
# NOTE: every pattern is anchored to the whole line, so only a line consisting
# of just the heading word(s) counts as a header; numbered headings such as
# "1. Introduction" are treated as ordinary content.
SECTION_PATTERNS = [
    r'^\s*abstract\s*$', r'^\s*introduction\s*$', r'^\s*background\s*$',
    r'^\s*related\s+work\s*$', r'^\s*methodology\s*$', r'^\s*methods\s*$',
    r'^\s*results\s*$', r'^\s*discussion\s*$', r'^\s*conclusion\s*$',
    r'^\s*references\s*$', r'^\s*acknowledgments?\s*$'
]


def _finish_section(sections, name, start_line, lines):
    """Append the section to *sections* unless its content is blank."""
    content = ''.join(lines)
    if content.strip():
        sections.append({'name': name, 'start_line': start_line, 'content': content})


def detect_sections(text):
    """Split paper text into sections at recognised heading lines.

    A line is a heading when, ignoring case and surrounding whitespace, it
    matches one of ``SECTION_PATTERNS``. Text before the first heading is
    collected under a section named ``'Header'``. Heading lines themselves are
    not included in any section's content.

    Args:
        text: Full document text, lines separated by ``'\\n'``.

    Returns:
        A list of dicts in document order, each with keys ``'name'``
        (title-cased heading), ``'start_line'`` (0-based index of the heading
        line, 0 for ``'Header'``) and ``'content'`` (the section's lines, each
        terminated by ``'\\n'``). Sections whose content is only whitespace
        are omitted.
    """
    # One alternation built per call (so later edits to SECTION_PATTERNS are
    # honoured) instead of trying every pattern on every line. `(?!)` never
    # matches, which keeps an empty pattern list from matching everything.
    combined = "|".join(f"(?:{pattern})" for pattern in SECTION_PATTERNS) or r"(?!)"
    header_re = re.compile(combined, re.IGNORECASE)

    sections = []
    name, start_line, lines = 'Header', 0, []

    for i, line in enumerate(text.split('\n')):
        heading = line.strip()
        if header_re.match(heading):
            _finish_section(sections, name, start_line, lines)
            name, start_line, lines = heading.title(), i, []
        else:
            # Collect lines in a list and join once; repeated `+=` on a string
            # is quadratic for long documents.
            lines.append(line + '\n')

    _finish_section(sections, name, start_line, lines)
    return sections

print("\nüîç Extracting text and detecting sections...")
pdf_text = ""
for filename in uploaded:
    reader = PyPDF2.PdfReader(open(filename, "rb"))
    for page_num, page in enumerate(reader.pages):
        page_text = page.extract_text()
        pdf_text += f"\n[PAGE {page_num + 1}]\n{page_text}"

sections = detect_sections(pdf_text)
print(f"‚úÖ Detected {len(sections)} sections:")
for sec in sections:
    print(f"   - {sec['name']}")

#  STEP 4: Split each section into overlapping chunks tagged with its section name
print("\nüìö Chunking text with section preservation...")
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=500,
    chunk_overlap=100,
)

# One record per non-blank chunk, in section order; each record remembers the
# section it came from so queries can later be restricted to one section.
chunked_data = [
    {'text': piece, 'section': part['name']}
    for part in sections
    for piece in splitter.split_text(part['content'])
    if piece.strip()
]

print(f"‚úÖ Created {len(chunked_data)} chunks")

#  STEP 5: Create semantic embeddings with sentence-transformers
print("\nüßÆ Creating semantic embeddings...")
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Fail early with an actionable message: with no chunks there is nothing to
# embed, and the index construction below would otherwise fail obscurely when
# reading the embedding dimension.
if not chunked_data:
    raise ValueError(
        "No text chunks were produced from the uploaded PDF(s); "
        "the file may contain no extractable text."
    )

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
texts = [item['text'] for item in chunked_data]
embeddings = embedding_model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
embeddings = embeddings.astype('float32')

# Create FAISS index with cosine similarity: vectors are L2-normalised in
# place, so the inner product computed by IndexFlatIP equals cosine similarity.
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
faiss.normalize_L2(embeddings)
index.add(embeddings)
print(f"‚úÖ Built FAISS index with {index.ntotal} vectors (dim={dimension})")

#  STEP 6: Implement rank-based re-weighting
def apply_rank_based_weighting(distances, k=5, decay=0.3):
    """Return normalised exponential-decay weights for ranks 1..k.

    The weight of rank ``r`` is proportional to ``exp(-decay * (r - 1))``, so
    the top-ranked item always gets the largest share and the weights sum
    to 1.

    Args:
        distances: Retrieval scores of the ranked items. Accepted for interface
            compatibility; the weights depend only on rank, not on the scores.
        k: Number of ranks to produce weights for.
        decay: Decay rate per rank step. ``0`` gives uniform weights; larger
            values concentrate weight on the top ranks.

    Returns:
        A 1-D float NumPy array of length ``k`` (empty when ``k <= 0``).
    """
    # np.arange(k) is (rank - 1) for ranks 1..k.
    weights = np.exp(-decay * np.arange(k))
    return weights / weights.sum()

def get_top_k_chunks(query, k=5, section_filter=None):
    """Retrieve the chunks most similar to *query*, with rank-based weights.

    The query is embedded with the module-level ``embedding_model``,
    L2-normalised, and searched against an inner-product FAISS index (the
    global ``index``, or a temporary index over one section's vectors when a
    filter is active).

    Args:
        query: Natural-language question.
        k: Maximum number of chunks to return; clamped to the number of
            vectors actually available.
        section_filter: Section name to restrict the search to. ``None``, an
            empty string or ``"All"`` searches every section.

    Returns:
        A tuple ``(retrieved_chunks, weights, distances)``:
        ``retrieved_chunks`` is a list of chunk dicts from ``chunked_data``
        ordered best-first, ``weights`` are the rank weights from
        ``apply_rank_based_weighting``, and ``distances`` are the
        inner-product similarity scores. All three are empty lists when
        nothing can be retrieved.
    """
    query_embedding = embedding_model.encode([query], convert_to_numpy=True).astype('float32')
    faiss.normalize_L2(query_embedding)

    if section_filter and section_filter != "All":
        # Positions in chunked_data that belong to the requested section.
        candidate_ids = [i for i, item in enumerate(chunked_data) if item['section'] == section_filter]
        if not candidate_ids:
            return [], [], []
        search_index = faiss.IndexFlatIP(embeddings.shape[1])
        search_index.add(embeddings[candidate_ids])
    else:
        candidate_ids = None
        search_index = index

    # Never ask for more neighbours than exist: FAISS pads missing results
    # with label -1, which as a Python list index would silently select the
    # last chunk.
    k = min(k, search_index.ntotal)
    if k <= 0:
        return [], [], []

    scores, positions = search_index.search(query_embedding, k)
    found = positions[0] >= 0  # defensive: drop any padded (-1) results
    positions, scores = positions[0][found], scores[0][found]

    if candidate_ids is not None:
        # Map positions in the temporary index back to chunked_data positions.
        chunk_ids = [candidate_ids[pos] for pos in positions]
    else:
        chunk_ids = list(positions)
    retrieved_chunks = [chunked_data[i] for i in chunk_ids]

    weights = apply_rank_based_weighting(scores, k=len(retrieved_chunks))
    return retrieved_chunks, weights, scores

#  STEP 7: Setup OpenAI for response generation
print("\nüîë Setting up OpenAI API...")
import os
from getpass import getpass
from google.colab import userdata

# Get API key from Colab secrets, falling back to an interactive prompt.
try:
    openai_key = userdata.get('OPENAI_API_KEY')
except Exception:
    # Any failure to read the secret falls back to manual entry. `Exception`
    # (not a bare `except`) lets KeyboardInterrupt/SystemExit propagate.
    # getpass keeps the key from being echoed into the notebook output.
    openai_key = getpass("Please enter your OpenAI API key: ")
else:
    print("‚úÖ Using API key from Colab secrets")

os.environ['OPENAI_API_KEY'] = openai_key

import openai
openai.api_key = openai_key

def generate_response(retrieved_chunks, weights, query):
    """Ask the chat model to answer *query* from the retrieved chunks.

    Each chunk is rendered as a labelled context entry: the first is PRIMARY,
    the next two SECONDARY, any others SUPPORTING, each annotated with its
    rank weight as a percentage and its section name. The entries and the
    question are embedded in a prompt sent to ``gpt-4o-mini``.

    Args:
        retrieved_chunks: Chunk dicts with ``'section'`` and ``'text'`` keys,
            ordered best-first.
        weights: Relevance weights paired positionally with the chunks.
        query: The user's question.

    Returns:
        The message content of the first completion choice.
    """
    def tier(position):
        # Importance label derived purely from rank position.
        if position == 0:
            return "PRIMARY"
        return "SECONDARY" if position < 3 else "SUPPORTING"

    context = "\n".join(
        f"[{tier(position)} CONTEXT - Relevance: {share:.2%}]\n"
        f"Section: {entry['section']}\n"
        f"Content: {entry['text']}\n"
        for position, (entry, share) in enumerate(zip(retrieved_chunks, weights))
    )
    prompt = f"""
You are an expert research assistant analyzing a scientific paper. Use ONLY the provided context to answer the question.

IMPORTANT:
- Pay MORE attention to PRIMARY context (highest relevance)
- Cite section names when providing information
- Provide specific, factual answers based on the paper

CONTEXT FROM SCIENTIFIC PAPER:
{context}

QUESTION: {query}

Provide a clear, well-structured answer with section citations.
"""

    messages = [
        {"role": "system", "content": "You are an expert research assistant with high factual accuracy."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0.2,
        max_tokens=800,
    )

    return completion.choices[0].message.content

#  STEP 8: Interactive querying with section filtering
from collections import Counter

# Chunk count per section, computed once up front. Counter keeps first-seen
# order, so sections are listed in document order rather than set order.
section_counts = Counter(item['section'] for item in chunked_data)

print("\n" + "="*60)
print("üî¨ SCIENTIFIC PAPER RAG CHATBOT - READY!")
print("="*60)
print("\nAvailable sections:", ", ".join(section_counts))
print("\nOptions:")
print("  - Ask a question directly")
print("  - Type 'filter:SectionName' to search only in that section")
print("  - Type 'filter:All' to search all sections again")
print("  - Type 'sections' to list all sections")
print("  - Type 'exit' to quit\n")

# Currently active section restriction; None means "search every section".
section_filter = None

while True:
    try:
        user_input = input("\nüí¨ Your input: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Interrupting the cell or closing stdin ends the session cleanly.
        print("üëã Goodbye!")
        break

    # Ignore blank input instead of sending an empty question to the API.
    if not user_input:
        continue

    command = user_input.lower()

    if command == "exit":
        print("üëã Goodbye!")
        break

    if command == "sections":
        print("\nüìë Available sections:")
        for sec, count in section_counts.items():
            print(f"   - {sec} ({count} chunks)")
        continue

    if command.startswith("filter:"):
        # Section names are stored title-cased, so normalise the same way.
        requested = user_input[len("filter:"):].strip().title()
        if not requested or requested == "All":
            section_filter = None
            print("‚úÖ Filter set to: All")
        elif requested in section_counts:
            section_filter = requested
            print(f"‚úÖ Filter set to: {section_filter}")
        else:
            # Reject unknown names up front; otherwise every later query
            # would come back empty with no hint as to why.
            print(f"Unknown section '{requested}'. Available sections: {', '.join(section_counts)}")
        continue

    # Process as a query
    query = user_input
    print(f"\nüîç Searching {'in section: ' + section_filter if section_filter else 'all sections'}...")

    retrieved_chunks, weights, distances = get_top_k_chunks(query, k=5, section_filter=section_filter)

    if not retrieved_chunks:
        print("‚ö†Ô∏è No relevant content found")
        continue

    print(f"üìä Retrieved {len(retrieved_chunks)} chunks with rank-based weighting\n")

    # Show retrieval details
    print("üìå Retrieved chunks:")
    for i, (chunk, weight) in enumerate(zip(retrieved_chunks, weights)):
        print(f"   {i+1}. [{chunk['section']}] - Weight: {weight:.2%}")

    # Generate answer
    print("\n‚è≥ Generating answer...")
    try:
        answer = generate_response(retrieved_chunks, weights, query)
    except Exception as exc:
        # Top-level boundary: report the failure (e.g. an API or network
        # error) and keep the session alive.
        print(f"Could not generate an answer: {exc}")
        continue

    print("\n" + "="*60)
    print("‚úÖ ANSWER:")
    print("="*60)
    print(answer)
    print("="*60)
