<a href="https://colab.research.google.com/github/anujushir/Ai-Bot-Projects/blob/main/Research_Paper_Summarizer_%26_Navigator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import html
import json
import os
import re
from datetime import date
from typing import List, Dict, Any

import streamlit as st
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_ollama import OllamaEmbeddings
from langchain_ollama.llms import OllamaLLM
from langchain_text_splitters import RecursiveCharacterTextSplitter

# ================== STYLING ==================
# Keyword arguments for the page setup call.
_PAGE_OPTIONS = {"page_title": "Research Paper Summarizer", "layout": "wide"}

# CSS for the hand-written HTML snippets rendered through st.markdown.
# `.section-header` and `.paper-card` are used by the markup further down;
# `.nav-button` is defined here but not referenced by the markup in this file.
_CUSTOM_CSS = """
<style>
    .section-header {
        background-color: #f0f2f6;
        padding: 10px;
        border-radius: 5px;
        margin: 10px 0px;
        font-weight: bold;
    }
    .paper-card {
        border: 1px solid #ddd;
        padding: 15px;
        margin: 10px 0px;
        border-radius: 5px;
        background-color: white;
    }
    .nav-button {
        background-color: #4CAF50;
        color: white;
        padding: 5px 10px;
        border: none;
        border-radius: 3px;
        cursor: pointer;
        margin: 2px;
    }
</style>
"""

# Configure the page first, then inject the stylesheet.
st.set_page_config(**_PAGE_OPTIONS)
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# ================== CONSTANTS ==================
# Whole-paper prompt; its only placeholder is {document_content}.
_SUMMARY_PROMPT = """
You are a research paper analysis expert. Analyze the given research paper and provide a structured summary with the following sections:

1. **Title & Authors**: Extract the paper title and authors
2. **Abstract**: Key summary of the paper
3. **Key Contributions**: Main innovations or contributions
4. **Methodology**: Approach and techniques used
5. **Key Findings**: Main results and discoveries
6. **Limitations**: Any limitations mentioned
7. **Citations**: Important references cited

For each major claim, idea, or metric mentioned, note the approximate section or page number where it appears.

Paper Content: {document_content}

Structured Summary:
"""

# Single-section prompt; placeholders are {section_content} and {section_ref}.
_SECTION_ANALYSIS_PROMPT = """
Analyze this specific section from a research paper and extract:
- Main claims or hypotheses
- Key metrics and results
- Methodologies used
- Important conclusions

Section: {section_content}
Page/Section Reference: {section_ref}

Analysis:
"""

# Lookup table used by the LLM helper functions, keyed by task name.
PROMPT_TEMPLATES = {
    "summary": _SUMMARY_PROMPT,
    "section_analysis": _SECTION_ANALYSIS_PROMPT,
}

# ================== MODELS ==================
# The same Ollama model tag backs both the embedder and the text generator.
_OLLAMA_MODEL_TAG = "deepseek-r1:1.5b"

EMBEDDING_MODEL = OllamaEmbeddings(model=_OLLAMA_MODEL_TAG)
# In-memory store that embeds documents and queries with EMBEDDING_MODEL.
DOCUMENT_VECTOR_DB = InMemoryVectorStore(EMBEDDING_MODEL)
LANGUAGE_MODEL = OllamaLLM(model=_OLLAMA_MODEL_TAG)

# ================== DATA STRUCTURES ==================
class ResearchPaper:
    """Plain container for one paper's title, authors, abstract, sections and summary."""

    def __init__(self, title="", authors="", abstract="", sections=None, summary=""):
        # Bibliographic fields are stored exactly as given.
        self.title, self.authors, self.abstract = title, authors, abstract
        # A missing or empty mapping is replaced by a fresh dict for this instance.
        self.sections = sections or {}
        self.summary = summary
        # Starts empty; nothing in the constructor populates it.
        self.metadata = {}

# ================== FUNCTIONS ==================
def load_pdf_documents(file_path):
    """Read the PDF at ``file_path`` with PDFPlumberLoader and return what it loads."""
    return PDFPlumberLoader(file_path).load()

def chunk_documents(raw_documents, chunk_size=800, chunk_overlap=100):
    """Split documents into overlapping chunks.

    Args:
        raw_documents: Documents to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        # Ask the splitter to record where each chunk starts.
        "add_start_index": True,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(raw_documents)

# Canonical section name -> pattern recognising its heading. Patterns are tried
# in this order against the stripped, lower-cased line; the numeric alternatives
# are a fallback for numbered headings whose title carries no known keyword.
_SECTION_HEADING_PATTERNS = {
    'abstract': re.compile(r'abstract|summary'),
    'introduction': re.compile(r'introduction|^1\.?\s'),
    'methodology': re.compile(r'method|approach|experiment|^2\.?\s'),
    'results': re.compile(r'result|finding|^3\.?\s|^4\.?\s'),
    'discussion': re.compile(r'discussion|conclusion|^5\.?\s|^6\.?\s'),
    'references': re.compile(r'reference|bibliography'),
}

# A heading is a short line; anything longer is treated as body text.
_MAX_HEADING_WORDS = 8

# Headings do not end like a sentence or a clause.
_SENTENCE_PUNCTUATION = ('.', ',', ';', '?', '!')


def _match_section_heading(line):
    """Return the canonical section a heading line names, or None for body text."""
    candidate = line.strip().lower()
    if not candidate or len(candidate.split()) > _MAX_HEADING_WORDS:
        return None
    if candidate.endswith(_SENTENCE_PUNCTUATION):
        return None
    # Bare numbers (page numbers, table rows) must not trigger the numeric fallback.
    if not any(ch.isalpha() for ch in candidate):
        return None
    for section, pattern in _SECTION_HEADING_PATTERNS.items():
        if pattern.search(candidate):
            return section
    return None


def extract_paper_sections(text):
    """Split paper text into common sections using heading heuristics.

    A line counts as a heading only when it looks like one: at most
    ``_MAX_HEADING_WORDS`` words, no sentence-ending punctuation, at least one
    letter, and a match against one of ``_SECTION_HEADING_PATTERNS``. Body
    sentences that merely mention words such as "method" or "results"
    therefore stay in the section content instead of being consumed as
    headings.

    Args:
        text: Full text of the paper; ``None`` is treated as an empty string.

    Returns:
        Dict mapping section name to its text, in order of first appearance.
        Text before the first recognised heading is stored under ``'header'``,
        which is always present. Heading lines themselves are not included.
        When a section name occurs more than once, its parts are joined with
        a newline.
    """
    section_lines = {'header': []}
    current_lines = section_lines['header']

    for line in (text or '').split('\n'):
        section = _match_section_heading(line)
        if section is None:
            current_lines.append(line)
        else:
            # Re-entering a known section keeps appending to the same line list.
            current_lines = section_lines.setdefault(section, [])

    return {name: '\n'.join(body) for name, body in section_lines.items()}

def generate_structured_summary(paper_content):
    """Generate a structured summary of a paper with the configured LLM.

    Args:
        paper_content: Paper text substituted into the ``summary`` prompt
            template as ``document_content``.

    Returns:
        The model's response. When the response is a string, any
        ``<think>...</think>`` block is removed and surrounding whitespace is
        stripped, so a reasoning model's scratch work is not shown to users or
        written to the export. Non-string responses are returned unchanged.
    """
    summary_chain = ChatPromptTemplate.from_template(PROMPT_TEMPLATES["summary"]) | LANGUAGE_MODEL
    response = summary_chain.invoke({"document_content": paper_content})
    if isinstance(response, str):
        # Reasoning models may embed their chain of thought in the answer text.
        response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL).strip()
    return response

def analyze_section(section_content, section_ref):
    """Analyze one section of a paper with the configured LLM.

    Args:
        section_content: Text of the section to analyze.
        section_ref: Human-readable reference (paper/section or search
            context) inserted into the prompt.

    Returns:
        The model's response. When the response is a string, any
        ``<think>...</think>`` block is removed and surrounding whitespace is
        stripped, so a reasoning model's scratch work is not shown to users.
        Non-string responses are returned unchanged.
    """
    analysis_chain = (
        ChatPromptTemplate.from_template(PROMPT_TEMPLATES["section_analysis"])
        | LANGUAGE_MODEL
    )
    response = analysis_chain.invoke({
        "section_content": section_content,
        "section_ref": section_ref,
    })
    if isinstance(response, str):
        # Reasoning models may embed their chain of thought in the answer text.
        response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL).strip()
    return response

def index_paper_chunks(paper_chunks):
    """Add ``paper_chunks`` to the module-level vector store; returns nothing."""
    vector_store = DOCUMENT_VECTOR_DB
    vector_store.add_documents(paper_chunks)

def find_relevant_sections(query):
    """Run a similarity search for ``query`` on the module-level vector store, asking for 3 hits."""
    top_matches = DOCUMENT_VECTOR_DB.similarity_search(query, k=3)
    return top_matches

# ================== MOCK DATA FOR DEMONSTRATION ==================
def create_mock_papers():
    """Create mock research papers for demonstration.

    The script body runs again on every Streamlit interaction, and building
    the mock paper involves an LLM call for its summary. The papers are
    therefore kept in ``st.session_state`` after the first successful build
    and reused for the rest of the session. If summary generation raises,
    nothing is cached and the next call tries again.

    Returns:
        A list of ``ResearchPaper`` objects. The list is a fresh shallow copy
        on each call; the paper objects themselves are shared, so a summary
        assigned to a paper later is kept across reruns.
    """
    cache_key = "_mock_research_papers"
    if cache_key in st.session_state:
        return list(st.session_state[cache_key])

    # Mock paper 1
    sections = {
        'abstract': "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks...",
        'introduction': "Recurrent neural networks, long short-term memory and gated recurrent neural networks in particular, have been firmly established as state of the art approaches...",
        'methodology': "The Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder...",
        'results': "We trained the base model for a total of 100,000 steps... The big model achieves a BLEU score of 28.4...",
        'discussion': "In this work, we presented the Transformer, the first sequence transduction model based entirely on attention..."
    }
    paper1 = ResearchPaper(
        title="Attention Is All You Need: Transformer Architecture for NLP",
        authors="Vaswani et al.",
        abstract="We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.",
        sections=sections,
    )
    paper1.summary = generate_structured_summary("\n".join(sections.values()))

    papers = [paper1]
    st.session_state[cache_key] = papers
    return list(papers)

# ================== MAIN APP ==================
# Page title. The emoji in the user-facing labels had been corrupted into
# mojibake (UTF-8 bytes decoded as Mac Roman) and are restored here.
st.title("🔬 Research Paper Summarizer & Navigator")
st.markdown("---")

# Sidebar for paper selection and filters.
# NOTE: topic, year range and venue are collected here, but nothing in the
# code visible in this file consumes them yet (the papers are mock data).
st.sidebar.header("📚 Paper Selection")

# Topic input
topic = st.sidebar.text_input("Enter research topic:", placeholder="e.g., Transformers in NLP")

# Year range filter. The upper bound follows the calendar instead of a fixed
# year, and never drops below the previous bound of 2024.
latest_year = max(date.today().year, 2024)
col1, col2 = st.sidebar.columns(2)
with col1:
    min_year = st.number_input("From year", min_value=1990, max_value=latest_year, value=2018)
with col2:
    max_year = st.number_input("To year", min_value=1990, max_value=latest_year, value=latest_year)

# Venue filter
venue = st.sidebar.selectbox("Venue filter:", ["All", "NeurIPS", "ICML", "ICLR", "CVPR", "ACL", "EMNLP"])

# Load/Create papers
with st.spinner("Loading research papers..."):
    # In a real implementation, you would fetch papers based on topic.
    # For now, using mock data.
    research_papers = create_mock_papers()

    # One Document per paper section. The metadata carries the paper's index
    # in research_papers so a search hit can be mapped back to its paper.
    # Document comes from langchain_core (imported at the top of the file)
    # rather than the deprecated langchain.schema path.
    all_chunks = [
        Document(
            page_content=content,
            metadata={
                "paper_index": i,
                "section": section_name,
                "title": paper.title,
                "authors": paper.authors,
            },
        )
        for i, paper in enumerate(research_papers)
        for section_name, content in paper.sections.items()
    ]

    # Index all chunks for semantic search (nothing to embed when there are none).
    if all_chunks:
        index_paper_chunks(all_chunks)

st.success(f"✅ Loaded {len(research_papers)} research papers!")

# Main content area. Emoji in the labels below were mojibake and are restored.
tab1, tab2, tab3 = st.tabs(["📄 Paper Browser", "🔍 Semantic Search", "📊 Summary Dashboard"])

with tab1:
    st.header("Paper Browser")

    for i, paper in enumerate(research_papers):
        with st.expander(f"📖 {paper.title}", expanded=i == 0):
            st.markdown(f"**Authors:** {paper.authors}")

            # Section navigation: one button per section, wrapped over six columns.
            st.markdown('<div class="section-header">📑 Quick Navigation</div>', unsafe_allow_html=True)
            cols = st.columns(6)

            for idx, section in enumerate(paper.sections):
                with cols[idx % 6]:
                    if st.button(section.title(), key=f"nav_{i}_{section}"):
                        st.session_state[f"selected_section_{i}"] = section

            # Show the selected section. Default to the abstract, or to the
            # paper's first section when it has no abstract.
            default_section = (
                "abstract"
                if "abstract" in paper.sections
                else next(iter(paper.sections), "abstract")
            )
            selected_section = st.session_state.get(f"selected_section_{i}", default_section)
            st.markdown(f'<div class="section-header">📖 {selected_section.upper()}</div>', unsafe_allow_html=True)
            st.text_area(
                "Content",
                paper.sections.get(selected_section, "Content not available"),
                height=200,
                key=f"content_{i}_{selected_section}",
            )

            # Section analysis (runs the LLM only when the button is pressed)
            if st.button(f"Analyze {selected_section}", key=f"analyze_{i}_{selected_section}"):
                with st.spinner(f"Analyzing {selected_section}..."):
                    analysis = analyze_section(
                        paper.sections.get(selected_section, ""),
                        f"Paper: {paper.title}, Section: {selected_section}"
                    )
                    st.markdown("**Section Analysis:**")
                    st.write(analysis)

with tab2:
    st.header("Semantic Search Across Papers")

    search_query = st.text_input("🔍 Search for concepts, methods, or findings:")

    # Ignore whitespace-only input so it does not trigger a search.
    if search_query.strip():
        with st.spinner("Searching across papers..."):
            relevant_docs = find_relevant_sections(search_query)

        st.markdown(f"**Found {len(relevant_docs)} relevant sections:**")

        for doc in relevant_docs:
            paper_idx = doc.metadata["paper_index"]
            section_name = doc.metadata["section"]
            paper = research_papers[paper_idx]

            # Paper text is placed inside raw HTML, so escape it: characters
            # such as "<" or "&" in the content would otherwise be parsed as markup.
            st.markdown(f"""
            <div class="paper-card">
                <h4>📄 {html.escape(paper.title)}</h4>
                <p><strong>Section:</strong> {html.escape(section_name)}</p>
                <p><strong>Relevance:</strong> {html.escape(doc.page_content[:200])}...</p>
            </div>
            """, unsafe_allow_html=True)

            col1, col2 = st.columns([1, 4])
            with col1:
                if st.button("View Full Section", key=f"view_{paper_idx}_{section_name}"):
                    st.session_state["selected_paper"] = paper_idx
                    st.session_state[f"selected_section_{paper_idx}"] = section_name
                    # The browser tab has already been drawn in this run; rerun
                    # so it picks up the new selection.
                    st.rerun()
            with col2:
                if st.button("Analyze This Context", key=f"analyze_context_{paper_idx}_{section_name}"):
                    with st.spinner("Analyzing context..."):
                        analysis = analyze_section(doc.page_content, f"From search: {search_query}")
                        st.markdown("**Context Analysis:**")
                        st.write(analysis)

with tab3:
    st.header("Paper Summaries Dashboard")

    for i, paper in enumerate(research_papers):
        # The emoji in the expander label was mojibake and is restored here.
        with st.expander(f"📊 Summary: {paper.title}", expanded=i == 0):
            if not paper.summary:
                # No summary yet: generate one from all section texts and keep
                # it on the paper object.
                with st.spinner("Generating summary..."):
                    full_content = "\n".join(paper.sections.values())
                    paper.summary = generate_structured_summary(full_content)
            st.markdown(paper.summary)

# Download functionality
st.sidebar.markdown("---")
st.sidebar.header("Export Results")
# The emoji in the button label was mojibake and is restored here.
if st.sidebar.button("📥 Export Summaries as JSON"):
    # Export each paper's title, authors, summary and section names
    # (section text itself is not included).
    export_data = [
        {
            "title": paper.title,
            "authors": paper.authors,
            "summary": paper.summary,
            "sections": list(paper.sections.keys()),
        }
        for paper in research_papers
    ]

    st.sidebar.download_button(
        label="Download JSON",
        data=json.dumps(export_data, indent=2),
        file_name="research_paper_summaries.json",
        mime="application/json"
    )

st.markdown("---")
st.markdown("*Note: This is a demonstration using mock data. In production, integrate with arXiv API, PubMed, or other academic databases.*")