In [20]:
from datetime import datetime
from pathlib import Path
from typing import Annotated, TypedDict
from urllib.parse import quote_plus

import requests
from docx import Document
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Define the state structure
class ResearchState(TypedDict):
    """Shared state dictionary passed between the nodes of the research workflow."""

    # Original user prompt; read by prompt_parser_node.
    raw_query: str
    # Search topic extracted from raw_query; used for the Scholar search URL
    # and for the heading of the generated document.
    query: str
    # Page title -> concatenated paragraph text, or the sentinel string
    # "Document is locked" when the page had no access icon.
    scraped_articles: dict
    # Page title -> summary text (or a "not accessible" placeholder).
    summarized_articles: dict
    # Destination path handed to Document.save() by create_document_node.
    document_path: str
    # Status messages emitted by each node. Annotated with add_messages,
    # presumably so node-returned messages are merged into the list rather
    # than replacing it (confirm against the LangGraph docs).
    messages: Annotated[list, add_messages]
    # Name of the last completed stage; checked by the conditional edge functions.
    current_step: str

In [21]:
def scrape_articles_node(state: ResearchState) -> dict:
    """Scrape the Google Scholar results for ``state["query"]``.

    Opens a Chrome session, searches Scholar for the query plus the word
    "ieee", then visits every result link in turn. For each visited page:

    * if an element with class ``document-access-icon`` is present, the text
      of all ``<p>`` elements inside the ``ArticlePage`` element is collected;
    * otherwise the article is recorded with the sentinel text
      ``"Document is locked"``.

    Args:
        state: Workflow state; only ``state["query"]`` is read.

    Returns:
        Partial state update with ``scraped_articles`` (page title -> text),
        ``current_step`` set to ``"scraping_complete"`` and one status message.

    Raises:
        Whatever Selenium raises (e.g. on a wait timeout). The browser is
        always closed before the exception propagates.

    Note:
        Results are keyed by page title, so two pages with the same title
        overwrite each other.
    """
    query = state["query"]

    # URL-encode the search terms so characters such as '&', '#' or spaces in
    # the query cannot break or truncate the request URL.
    search_terms = quote_plus(f"{query.strip()} ieee")
    search_url = f"https://scholar.google.com/scholar?q={search_terms}"

    all_docs = {}
    driver = webdriver.Chrome()
    try:
        driver.get(search_url)

        wait = WebDriverWait(driver, 10)
        results_title = driver.title
        result_count = len(driver.find_elements(By.CSS_SELECTOR, ".gs_rt a"))

        for index in range(result_count):
            # Element handles go stale after navigating away, so the links are
            # looked up again on every pass.
            links = driver.find_elements(By.CSS_SELECTOR, ".gs_rt a")
            if index >= len(links):
                # The results page came back with fewer links than before.
                break

            links[index].click()
            # A changed title is used as the signal that the article page loaded.
            wait.until(lambda d: d.title != results_title)
            page_title = driver.title

            access = driver.find_elements(By.CLASS_NAME, "document-access-icon")
            if access:
                article = wait.until(
                    EC.presence_of_element_located((By.CLASS_NAME, "ArticlePage"))
                )
                paragraphs = article.find_elements(By.TAG_NAME, "p")
                all_docs[page_title] = " ".join(p.text for p in paragraphs)
            else:
                all_docs[page_title] = "Document is locked"

            # Return to the result list and wait until it is rendered again.
            driver.back()
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".gs_rt")))
    finally:
        # Always release the browser, even when a click or wait fails.
        driver.quit()

    return {
        "scraped_articles": all_docs,
        "current_step": "scraping_complete",
        "messages": [{"role": "system", "content": f"Scraped {len(all_docs)} articles"}]
    }

def summarize_articles_node(state: ResearchState) -> dict:
    """Summarize every scraped article through the local generation endpoint.

    Each article text is posted to ``http://localhost:11434/api/generate``
    (model ``mistral``, non-streaming) and the ``response`` field of the JSON
    reply is stored as its summary. Articles marked ``"Document is locked"``
    and articles with no text are not sent to the model; they receive a
    placeholder summary instead.

    Args:
        state: Workflow state; only ``state["scraped_articles"]`` is read.

    Returns:
        Partial state update with ``summarized_articles`` (title -> summary),
        ``current_step`` set to ``"summarization_complete"`` and one status
        message.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the endpoint does not answer in time.
    """
    # (connect, read) in seconds; generation can be slow, so the read limit is generous.
    request_timeout = (10, 600)

    doc_dict = state["scraped_articles"]
    summarized_dict = {}

    for doc_title, doc_content in doc_dict.items():
        # Skip the locked sentinel and pages that yielded no text at all:
        # there is nothing meaningful for the model to summarize.
        if doc_content == "Document is locked" or not doc_content.strip():
            summarized_dict[doc_title] = "Article was not accessible for summarization"
            continue

        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": "mistral",
                "prompt": f"Summarize the following research article in 2-3 paragraphs, focusing on key findings and methodology:\n\n{doc_content}\n\nSummary:",
                "stream": False
            },
            timeout=request_timeout,
        )
        # Fail with a clear HTTP error instead of a KeyError on the JSON below.
        response.raise_for_status()
        summarized_dict[doc_title] = response.json()["response"].strip()

    return {
        "summarized_articles": summarized_dict,
        "current_step": "summarization_complete",
        "messages": [{"role": "system", "content": f"Summarized {len(summarized_dict)} articles"}]
    }

def create_document_node(state: ResearchState) -> dict:
    """Write the article summaries to a Word document.

    The document gets a title heading containing the query, a
    "Generated on" timestamp line, and one level-1 heading plus paragraph per
    summarized article. It is saved to ``state["document_path"]``; missing
    parent directories are created first.

    Args:
        state: Workflow state; reads ``summarized_articles``, ``query`` and
            ``document_path``.

    Returns:
        Partial state update with ``document_path``, ``current_step`` set to
        ``"document_created"`` and one status message.
    """
    summarized_dict = state["summarized_articles"]
    query = state["query"]
    file_path = state["document_path"]

    doc = Document()
    doc.add_heading(f'Research Analysis: {query}', 0)
    doc.add_paragraph(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    doc.add_paragraph("")  # blank spacer line

    for title, summary in summarized_dict.items():
        doc.add_heading(title, level=1)
        doc.add_paragraph(summary)
        doc.add_paragraph("")  # blank spacer line between articles

    # Saving fails if the target folder is missing, so create it up front.
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
    doc.save(file_path)

    return {
        "document_path": file_path,
        "current_step": "document_created",
        "messages": [{"role": "system", "content": f"Document saved as {file_path}"}]
    }

def prompt_parser_node(state: ResearchState) -> dict:
    """Extract a short search query from the user's free-form prompt.

    Posts the prompt to ``http://localhost:11434/api/generate`` (model
    ``mistral``, non-streaming) with instructions to return a 2-5 word
    research topic. The reply is cleaned of surrounding whitespace and quote
    characters; if nothing is left, the original prompt is used as the query.

    Args:
        state: Workflow state; only ``state["raw_query"]`` is read.

    Returns:
        Partial state update with ``query``, ``current_step`` set to
        ``"query_extraction_complete"`` and one status message.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the endpoint does not answer in time.
    """
    prompt = state["raw_query"]

    research_query = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": "mistral",
                "prompt": f"""Extract a concise research topic from this user prompt: "{prompt}"

                    Rules:
                    - Return only the main research topic (2-5 words)
                    - Remove filler words like "find", "search", "papers about"
                    - Focus on the core subject matter
                    - Make it suitable for academic search

                    Examples:
                    - "Find recent papers on AI agents in healthcare" → "AI agents healthcare"
                    - "I want research about machine learning for climate change" → "machine learning climate change"

                    User prompt: {prompt}
                    Research topic:""",
                "stream": False
            },
            # (connect, read) in seconds so a stalled endpoint cannot hang the workflow.
            timeout=(10, 600),
        )
    # Fail with a clear HTTP error instead of a KeyError on the JSON below.
    research_query.raise_for_status()

    # The model tends to pad its answer with whitespace and may echo the
    # quotes used in the examples; neither belongs in the search query.
    research_query_text = research_query.json()["response"].strip().strip("\"'").strip()
    if not research_query_text:
        # Nothing usable came back; searching with the raw prompt is better than an empty query.
        research_query_text = prompt.strip()

    return {
        "query": research_query_text,
        "current_step": "query_extraction_complete",
        "messages": [{"role": "system", "content": f"extracted query: {research_query_text} from prompt: {prompt}"}]
    }

def should_continue_to_summarize(state: ResearchState) -> str:
    """Pick the branch after scraping: "summarize" only when scraping finished with results, otherwise "end"."""
    if state["current_step"] != "scraping_complete":
        return "end"
    return "summarize" if state["scraped_articles"] else "end"

def should_continue_to_document(state: ResearchState) -> str:
    """Pick the branch after summarization: "create_document" only when summaries exist, otherwise "end"."""
    has_summaries = (
        state["current_step"] == "summarization_complete"
        and bool(state["summarized_articles"])
    )
    return "create_document" if has_summaries else "end"

In [22]:
from datetime import datetime

def create_research_workflow():
    """Assemble the research graph and return its compiled form.

    Flow: START -> parse_prompt -> scrape_articles -> (summarize_articles | END)
    -> (create_document | END) -> END.
    """
    graph = StateGraph(ResearchState)

    # Register every processing step under its node name.
    node_handlers = (
        ("parse_prompt", prompt_parser_node),
        ("scrape_articles", scrape_articles_node),
        ("summarize_articles", summarize_articles_node),
        ("create_document", create_document_node),
    )
    for node_name, handler in node_handlers:
        graph.add_node(node_name, handler)

    # Unconditional transitions at the start of the pipeline.
    for origin, target in ((START, "parse_prompt"), ("parse_prompt", "scrape_articles")):
        graph.add_edge(origin, target)

    # After scraping, either summarize or stop.
    summarize_routes = {"summarize": "summarize_articles", "end": END}
    graph.add_conditional_edges("scrape_articles", should_continue_to_summarize, summarize_routes)

    # After summarizing, either write the document or stop.
    document_routes = {"create_document": "create_document", "end": END}
    graph.add_conditional_edges("summarize_articles", should_continue_to_document, document_routes)

    graph.add_edge("create_document", END)

    compiled_graph = graph.compile()
    return compiled_graph

# Build and compile the workflow once at module level;
# run_research_analysis() invokes this object.
research_workflow = create_research_workflow()

In [25]:
def run_research_analysis(prompt: str, document_path: str = ""):
    """Run the complete research analysis workflow and print a short report.

    Args:
        prompt: Free-form user request describing the research topic.
        document_path: Where the Word document should be written. When left
            empty, the notebook's original hard-coded location is used.

    Returns:
        The final workflow state returned by ``research_workflow.invoke``.
    """
    # Legacy default kept so existing calls without document_path behave as before.
    default_document_path = r"C:\Users\yuvra\OneDrive\Desktop\ML Labs\Projects\Browser Automation\Selenium Tutorial\summarized_article.docx"

    initial_state = {
        "raw_query": prompt,
        "query": "",
        "scraped_articles": {},
        "summarized_articles": {},
        "document_path": document_path or default_document_path,
        "messages": [],
        "current_step": "starting"
    }

    # Execute the workflow
    final_state = research_workflow.invoke(initial_state)

    print("Analysis complete!")
    print(f"Original prompt: {final_state['raw_query']}")
    print(f"Extracted query: {final_state['query']}")
    print(f"Scraped articles: {len(final_state['scraped_articles'])}")
    print(f"Summarized articles: {len(final_state['summarized_articles'])}")

    # The graph can end before the document step (e.g. nothing was scraped);
    # only claim a saved document when that step actually ran.
    if final_state["current_step"] == "document_created":
        print(f"Document saved at: {final_state['document_path']}")
    else:
        print(f"No document was created (workflow stopped at: {final_state['current_step']})")

    return final_state

In [26]:
# Run the full workflow on an example prompt and keep the final state in `result`.
# NOTE: this opens a Chrome browser through Selenium and sends requests to the
# HTTP endpoint at localhost:11434, so both must be available.
result = run_research_analysis("Research machine learning applications in early disease detection using medical imaging")

Analysis complete!
Original prompt: Research machine learning applications in early disease detection using medical imaging
Extracted query:  Machine Learning Applications for Early Disease Detection via Medical Imaging
Scraped articles: 10
Summarized articles: 10
Document saved at: C:\Users\yuvra\OneDrive\Desktop\ML Labs\Projects\Browser Automation\Selenium Tutorial\summarized_article.docx
