In [2]:
from browser_use import Agent, ChatGoogle
from dotenv import load_dotenv
import os
from datetime import datetime
import asyncio
import json

# Load GOOGLE_API_KEY from .env
load_dotenv()

# Initialize the Gemini model
llm = ChatGoogle(model="gemini-2.5-flash")

# Optimized research task
task = (
    "Search for the latest investor research and commentary on **momentum investing**, "
    "focusing on how strategies adapt to changing market dynamics. "
    "Prioritize high-quality sources including:\n"
    "- Investopedia (plain HTML, crawlable)\n"
    "- MarketWatch\n"
    "- Seeking Alpha (partially gated)\n"
    "- The Motley Fool\n"
    "- Yahoo Finance\n"
    "- Zacks\n"
    "- Morningstar (news + research)\n"
    "- InvestorPlace\n"
    "- Investing.com\n"
    "- Barron’s (abstracts crawlable)\n"
    "Tier 2 (Institutional Research):\n"
    "- CFA Institute Blog\n"
    "- SSRN (research papers)\n"
    "- NBER (working papers)\n"
    "- AQR Capital blogs\n"
    "- Research Affiliates insights\n\n"
    "Summarize findings into a **clean markdown report** with the following sections:\n"
    "## Key Findings\n"
    "## Implications for Investors\n"
    "## Practical Recommendations\n\n"
    "Also output a JSON array of structured takeaways with keys: "
    "[source, finding, implication, recommendation]."
)

# Create the browsing agent with browser configuration
agent = Agent(
    task=task,
    llm=llm,
    browser_config={
        "headless": True,
        "browser_type": "chromium",
        "browser_timeout": 60,
        "viewport_size": {"width": 1280, "height": 720},
        "extra_chromium_args": [
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--disable-extensions",
            "--disable-plugins",
            "--disable-images",       # Speed up loading
            "--disable-javascript",   # Faster text-focused crawling
        ]
    }
)

async def main():
    try:
        print("🚀 Starting momentum investing research agent...")
        history = await asyncio.wait_for(agent.run(max_steps=30), timeout=300)

        # Save outputs
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        md_path = f"momentum_investing_report_{timestamp}.md"
        summary_path = f"momentum_investing_summary_{timestamp}.txt"
        structured_path = f"momentum_investing_structured_{timestamp}.json"

        final_result = history.final_result() or ""

        # --- Markdown Report ---
        with open(md_path, "w", encoding="utf-8") as f:
            f.write("# Momentum Investing Research Report\n\n")
            f.write(f"*Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\n")
            f.write(final_result + "\n")

        # --- Plaintext Summary ---
        with open(summary_path, "w", encoding="utf-8") as f:
            f.write("MOMENTUM INVESTING SUMMARY\n")
            f.write("=" * 50 + "\n\n")
            f.write(final_result if final_result else "No results generated by the agent.")

        # --- Structured JSON (if available in model output) ---
        structured_data = []
        if hasattr(history, "structured_output") and history.structured_output:
            try:
                structured_data = json.loads(history.structured_output)
            except Exception:
                structured_data = []

        with open(structured_path, "w", encoding="utf-8") as f:
            json.dump(structured_data, f, indent=2)

        print(f"✅ Reports saved: {md_path}, {summary_path}, {structured_path}")

        # Show JSON preview
        if structured_data:
            print("📦 Structured JSON Preview:")
            print(json.dumps(structured_data[:3], indent=2))  # show first 3
        else:
            print("⚠️ No structured JSON detected in output. Only Markdown/summary saved.")

    except Exception as e:
        print(f"❌ Error occurred: {str(e)}")

# Run
await main()


INFO     [Agent] 🔗 Found URL in task: https://Investing.com, adding as initial action...
🚀 Starting momentum investing research agent...
INFO     [Agent] [34m🚀 Task: Search for the latest investor research and commentary on **momentum investing**, focusing on how strategies adapt to changing market dynamics. Prioritize high-quality sources including:
- Investopedia (plain HTML, crawlable)
- MarketWatch
- Seeking Alpha (partially gated)
- The Motley Fool
- Yahoo Finance
- Zacks
- Morningstar (news + research)
- InvestorPlace
- Investing.com
- Barron’s (abstracts crawlable)
Tier 2 (Institutional Research):
- CFA Institute Blog
- SSRN (research papers)
- NBER (working papers)
- AQR Capital blogs
- Research Affiliates insights

Summarize findings into a **clean markdown report** with the following sections:
## Key Findings
## Implications for Investors
## Practical Recommendations

Also output a JSON array of structured takeaways with keys: [source, finding, implication, recommendation].



In [11]:
from browser_use import Agent, ChatGoogle
from dotenv import load_dotenv
import os
from datetime import datetime
import asyncio
import json
import logging
import re
from pathlib import Path
import aiofiles
from typing import Dict, List, Optional

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('research_agent.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Load GOOGLE_API_KEY from .env
load_dotenv()

def validate_environment():
    """Validate required environment variables and dependencies."""
    if not os.getenv("GOOGLE_API_KEY"):
        raise ValueError("GOOGLE_API_KEY not found in environment variables")
    
    # Create output directory if it doesn't exist
    Path("research_outputs").mkdir(exist_ok=True)
    logger.info("Environment validation passed")

# Initialize the Gemini model with enhanced configuration
llm = ChatGoogle(
    model="gemini-2.5-flash",
    temperature=0.1,  # Lower temperature for more consistent research output
    max_tokens=8192   # Ensure sufficient context for comprehensive reports
)

# Enhanced research task with better structure and validation requirements
task = (
    "Search for the latest investor research and commentary on **momentum investing**, "
    "focusing on how strategies adapt to changing market dynamics. "
    "Prioritize high-quality sources including:\n"
    "Primary Sources (Tier 1):\n"
    "- Investopedia (plain HTML, crawlable)\n"
    "- MarketWatch\n"
    "- Seeking Alpha (partially gated)\n"
    "- The Motley Fool\n"
    "- Yahoo Finance\n"
    "- Zacks\n"
    "- Morningstar (news + research)\n"
    "- InvestorPlace\n"
    "- Investing.com\n"
    "- Barron's (abstracts crawlable)\n"
    "Secondary Sources (Tier 2 - Institutional Research):\n"
    "- CFA Institute Blog\n"
    "- SSRN (research papers)\n"
    "- NBER (working papers)\n"
    "- AQR Capital blogs\n"
    "- Research Affiliates insights\n\n"
    "MANDATORY OUTPUT REQUIREMENTS:\n"
    "1. Summarize findings into a **clean markdown report** with these exact sections:\n"
    "## Executive Summary\n"
    "## Key Findings\n"
    "## Market Dynamics and Trends\n"
    "## Implications for Investors\n"
    "## Practical Recommendations\n"
    "## Risk Considerations\n"
    "## Sources Consulted\n\n"
    "2. Include publication dates and credibility assessment for each source\n"
    "3. Ensure minimum 1000 words in the report\n"
    "4. End with a JSON array named 'STRUCTURED_TAKEAWAYS' containing objects with keys: "
    "[source, publication_date, finding, implication, recommendation, confidence_level]"
    "5. Validate that at least 5 different sources are referenced"
)

def create_enhanced_browser_config() -> Dict:
    """Create optimized browser configuration for research tasks."""
    return {
        "headless": True,
        "browser_type": "chromium", 
        "browser_timeout": 90,  # Increased timeout for complex pages
        "viewport_size": {"width": 1280, "height": 720},
        "extra_chromium_args": [
            "--no-sandbox",
            "--disable-dev-shm-usage", 
            "--disable-gpu",
            "--disable-extensions",
            "--disable-plugins",
            "--disable-images",
            "--disable-background-timer-throttling",
            "--disable-backgrounding-occluded-windows",
            "--disable-renderer-backgrounding",
            "--disable-features=TranslateUI",
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        ],
        "max_input_tokens": 4000,  # Handle longer page content
    }

def extract_json_from_text(text: str) -> Optional[List[Dict]]:
    """Extract JSON array from model output using multiple strategies."""
    try:
        # Strategy 1: Look for STRUCTURED_TAKEAWAYS
        pattern = r'STRUCTURED_TAKEAWAYS[\'"]?\s*[:=]\s*(\[.*?\])'
        match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
        if match:
            return json.loads(match.group(1))
        
        # Strategy 2: Find any JSON array in the text
        json_pattern = r'\[\s*\{.*?\}\s*\]'
        matches = re.findall(json_pattern, text, re.DOTALL)
        for match in matches:
            try:
                parsed = json.loads(match)
                if len(parsed) > 0 and isinstance(parsed[0], dict):
                    return parsed
            except:
                continue
                
        # Strategy 3: Look for code blocks with JSON
        code_block_pattern = r'```(?:json)?\s*(\[.*?\])\s*```'
        match = re.search(code_block_pattern, text, re.DOTALL | re.IGNORECASE)
        if match:
            return json.loads(match.group(1))
            
    except Exception as e:
        logger.warning(f"JSON extraction failed: {e}")
    
    return None

def validate_report_quality(content: str) -> Dict[str, bool]:
    """Validate the quality and completeness of the generated report."""
    required_sections = [
        "Executive Summary", "Key Findings", "Market Dynamics", 
        "Implications for Investors", "Practical Recommendations",
        "Risk Considerations", "Sources Consulted"
    ]
    
    validation = {
        "has_required_sections": all(section in content for section in required_sections),
        "meets_length_requirement": len(content.split()) >= 800,  # Slightly lower threshold
        "has_structured_data": "STRUCTURED_TAKEAWAYS" in content or bool(extract_json_from_text(content)),
        "has_source_citations": len(re.findall(r'\[.*?\]|\*.*?\*|Source:', content)) >= 3
    }
    
    validation["overall_quality"] = sum(validation.values()) >= 3
    return validation

async def save_outputs_async(final_result: str, timestamp: str) -> Dict[str, str]:
    """Asynchronously save all output files with enhanced error handling."""
    output_dir = Path("research_outputs")
    files_created = {}
    
    try:
        # File paths
        md_path = output_dir / f"momentum_investing_report_{timestamp}.md"
        summary_path = output_dir / f"momentum_investing_summary_{timestamp}.txt"
        structured_path = output_dir / f"momentum_investing_structured_{timestamp}.json"
        
        # Enhanced markdown report
        markdown_content = (
            f"# Momentum Investing Research Report\n\n"
            f"*Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n"
            f"*Research Agent Version: Enhanced v1.1*\n\n"
            f"---\n\n{final_result}\n\n"
            f"---\n*End of Report*"
        )
        
        # Save markdown report
        async with aiofiles.open(md_path, "w", encoding="utf-8") as f:
            await f.write(markdown_content)
        files_created["markdown"] = str(md_path)
        
        # Save plaintext summary
        summary_content = (
            "MOMENTUM INVESTING RESEARCH SUMMARY\n"
            "=" * 60 + "\n\n"
            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
            "=" * 60 + "\n\n"
            f"{final_result if final_result else 'No results generated by the agent.'}"
        )
        
        async with aiofiles.open(summary_path, "w", encoding="utf-8") as f:
            await f.write(summary_content)
        files_created["summary"] = str(summary_path)
        
        # Extract and save structured JSON
        structured_data = extract_json_from_text(final_result) or []
        
        # Add metadata to structured output
        output_metadata = {
            "generated_at": datetime.now().isoformat(),
            "agent_version": "enhanced_v1.1",
            "total_takeaways": len(structured_data),
            "takeaways": structured_data
        }
        
        async with aiofiles.open(structured_path, "w", encoding="utf-8") as f:
            await f.write(json.dumps(output_metadata, indent=2, ensure_ascii=False))
        files_created["structured"] = str(structured_path)
        
        logger.info(f"Successfully saved {len(files_created)} output files")
        return files_created
        
    except Exception as e:
        logger.error(f"Error saving outputs: {e}")
        return {}

async def main():
    """Main execution function with comprehensive error handling and validation."""
    try:
        # Validate environment
        validate_environment()
        
        print("🚀 Starting enhanced momentum investing research agent...")
        logger.info("Research agent starting")
        
        # Create the enhanced browsing agent
        agent = Agent(
            task=task,
            llm=llm,
            browser_config=create_enhanced_browser_config()
        )
        
        # Run the agent with extended parameters
        print("🔍 Agent executing research task...")
        history = await asyncio.wait_for(
            agent.run(max_steps=35),  # Increased max steps
            timeout=450  # Extended timeout to 7.5 minutes
        )
        
        final_result = history.final_result() or ""
        
        if not final_result:
            logger.warning("Agent returned empty result")
            print("⚠️ Agent completed but returned no results")
            return
        
        # Validate report quality
        validation = validate_report_quality(final_result)
        print(f"📊 Report Quality Assessment: {validation}")
        
        if not validation["overall_quality"]:
            logger.warning("Generated report may not meet quality standards")
            print("⚠️ Generated report may not meet all quality requirements")
        
        # Save outputs asynchronously
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        files_created = await save_outputs_async(final_result, timestamp)
        
        if files_created:
            print("✅ Reports successfully saved:")
            for file_type, path in files_created.items():
                print(f"   📄 {file_type.title()}: {path}")
        
        # Enhanced JSON preview with error handling
        structured_data = extract_json_from_text(final_result)
        if structured_data:
            print(f"\n📦 Structured Data Summary: {len(structured_data)} takeaways extracted")
            print("🔍 Sample takeaway:")
            sample = structured_data[0] if structured_data else {}
            for key, value in sample.items():
                print(f"   {key}: {str(value)[:100]}{'...' if len(str(value)) > 100 else ''}")
        else:
            print("⚠️ No structured JSON detected in output")
            
        # Final statistics
        word_count = len(final_result.split())
        print(f"\n📈 Research Statistics:")
        print(f"   📝 Total words: {word_count:,}")
        print(f"   ⏱️ Execution completed: {datetime.now().strftime('%H:%M:%S')}")
        
        logger.info(f"Research completed successfully. Word count: {word_count}")
        
    except asyncio.TimeoutError:
        logger.error("Maximum search research agent timed out after 10 minutes")
        print("⏰ Research agent timed out after 10 minutes. The comprehensive search may require more time.")
        print("💡 Consider running during off-peak hours for better performance.")
        
    except Exception as e:
        logger.error(f"Error occurred: {str(e)}", exc_info=True)
        print(f"❌ Error occurred: {str(e)}")
        print("💡 Check research_agent.log for detailed error information")
        print("🔄 Consider retrying with the run_with_retry() function")

# Enhanced execution with retry mechanism
async def run_with_retry(max_retries: int = 2):
    """Run the main function with retry logic for transient failures."""
    for attempt in range(max_retries + 1):
        try:
            await main()
            break
        except Exception as e:
            if attempt < max_retries:
                logger.warning(f"Attempt {attempt + 1} failed: {e}. Retrying...")
                print(f"🔄 Attempt {attempt + 1} failed. Retrying in 10 seconds...")
                await asyncio.sleep(10)
            else:
                logger.error(f"All {max_retries + 1} attempts failed")
                print(f"❌ All retry attempts failed. Final error: {e}")
                raise

if __name__ == "__main__":
    # Run with retry mechanism
    asyncio.run(run_with_retry())

TypeError: ChatGoogle.__init__() got an unexpected keyword argument 'max_tokens'

In [4]:
from browser_use import Agent, ChatGoogle
from dotenv import load_dotenv
import os
import asyncio
from datetime import datetime
import json

# ===============================
# Setup
# ===============================

load_dotenv()
llm = ChatGoogle(model="gemini-2.5-flash")

sources = [
    "investopedia.com",
    "marketwatch.com",
    "seekingalpha.com",
    "fool.com",
    "finance.yahoo.com",
    "zacks.com",
    "morningstar.com",
    "investorplace.com",
    "investing.com",
    "barrons.com",
    "cfainstitute.org",
    "ssrn.com",
    "nber.org",
    "aqr.com",
    "researchaffiliates.com"
]

visited_sources_file = "visited_sources.json"
markdown_path = "momentum_investing_summary.md"
structured_path = "momentum_investing_data.json"

# ===============================
# Helpers
# ===============================

def load_visited_sources():
    if os.path.exists(visited_sources_file):
        with open(visited_sources_file, "r", encoding="utf-8") as f:
            return set(json.load(f))
    return set()

def save_visited_sources(visited):
    with open(visited_sources_file, "w", encoding="utf-8") as f:
        json.dump(list(visited), f, indent=2)

def get_remaining_sources():
    visited = load_visited_sources()
    return [s for s in sources if s not in visited]

def append_markdown(new_text, path=markdown_path):
    with open(path, "a", encoding="utf-8") as f:
        f.write("\n\n---\n\n")
        f.write(new_text)

def append_structured(new_data, path=structured_path):
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            existing = json.load(f)
    else:
        existing = []

    if isinstance(new_data, str):
        try:
            new_data = json.loads(new_data)
        except:
            new_data = [{"raw": new_data}]

    combined = existing + new_data
    with open(path, "w", encoding="utf-8") as f:
        json.dump(combined, f, indent=2)

# ===============================
# Agent Setup
# ===============================

def make_agent(remaining_sources):
    return Agent(
        task=(
            "Search for the latest investor research and commentary on momentum investing, "
            "focusing on how strategies adapt to changing market dynamics. "
            "Summarize key takeaways, including insights on portfolio construction, "
            "trading costs, holding periods, scalability, and adaptation. "
            "Return a clean markdown summary with sections: Key Findings, Implications for Investors, "
            "and Practical Recommendations. "
            f"Restrict yourself to these sources: {', '.join(remaining_sources)}."
        ),
        llm=llm,
        browser_config={
            "headless": True,
            "browser_type": "chromium",
            "browser_timeout": 60,
            "viewport_size": {"width": 1280, "height": 720},
            "extra_chromium_args": [
                "--no-sandbox",
                "--disable-dev-shm-usage",
                "--disable-gpu",
                "--disable-extensions",
                "--disable-plugins",
                "--disable-images",
                "--disable-javascript",
            ]
        }
    )

# ===============================
# Main Run (manual trigger)
# ===============================

async def run_research(batch_size=3):
    visited = load_visited_sources()
    remaining = get_remaining_sources()

    if not remaining:
        print("✅ All sources already covered.")
        return

    next_batch = remaining[:batch_size]
    print(f"🔍 Running research on: {', '.join(next_batch)}")

    agent = make_agent(next_batch)

    try:
        history = await asyncio.wait_for(agent.run(max_steps=25), timeout=300)
    except asyncio.TimeoutError:
        print("❌ Agent timed out")
        return

    final_result = history.final_result()
    if final_result:
        # Save markdown
        append_markdown(f"## Research Batch ({', '.join(next_batch)})\n\n{final_result}")

        # Save structured if available
        if hasattr(history, "structured_output") and history.structured_output:
            append_structured(history.structured_output)

        print(f"✅ Results from {next_batch} saved.")
    else:
        print("⚠️ No final result produced.")

    # Update visited
    visited.update(next_batch)
    save_visited_sources(visited)

# ===============================
# Manual Execution
# ===============================
await run_research()
# Example: run one batch
if __name__ == "__main__":
    print("🔧 Manual research agent ready.")
    print("➡️ Run: await run_research() inside Jupyter or asyncio.run(run_research()) in script")


🔍 Running research on: investopedia.com, marketwatch.com, seekingalpha.com
INFO     [Agent] [34m🚀 Task: Search for the latest investor research and commentary on momentum investing, focusing on how strategies adapt to changing market dynamics. Summarize key takeaways, including insights on portfolio construction, trading costs, holding periods, scalability, and adaptation. Return a clean markdown summary with sections: Key Findings, Implications for Investors, and Practical Recommendations. Restrict yourself to these sources: investopedia.com, marketwatch.com, seekingalpha.com.[0m
INFO     [Agent] 🧠 Starting a browser-use version 0.7.3 with model=gemini-2.5-flash
INFO     [Agent] 

INFO     [Agent] 📍 Step 1:
INFO     [Agent]   ❔ Eval: The previous goal was to start the task, and since this is the first step, there's no previous action to evaluate. Verdict: No previous action.
INFO     [Agent]   [34m🎯 Next goal: Create a `todo.md` file with a detailed plan for searching the specified

In [6]:
from browser_use import Agent, ChatGoogle
from dotenv import load_dotenv
import os
import asyncio
from datetime import datetime
import json
import time
from pathlib import Path
import logging
from typing import List, Dict, Optional
import hashlib

# ===============================
# Quick Setup with Immediate Feedback
# ===============================

print("🔧 Initializing Enhanced Research Agent...")

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

load_dotenv()
llm = ChatGoogle(model="gemini-2.5-flash")

print("✅ LLM initialized")

# Streamlined source priorities
HIGH_PRIORITY_SOURCES = ["ssrn.com", "nber.org", "cfainstitute.org", "aqr.com"]
MEDIUM_PRIORITY_SOURCES = ["morningstar.com", "seekingalpha.com", "investopedia.com"]
LOW_PRIORITY_SOURCES = ["marketwatch.com", "finance.yahoo.com", "zacks.com"]

print(f"📊 Loaded {len(HIGH_PRIORITY_SOURCES + MEDIUM_PRIORITY_SOURCES + LOW_PRIORITY_SOURCES)} sources")

# Reduced configuration for faster startup
CONFIG = {
    "visited_sources_file": "visited_sources.json",
    "markdown_path": "momentum_investing_summary.md", 
    "structured_path": "momentum_investing_data.json",
    "failed_sources_file": "failed_sources.json",
    "session_timeout": 120,  # Reduced timeout
    "max_retries": 1,        # Reduced retries for faster iteration
    "content_min_length": 300,  # Reduced minimum for faster acceptance
    "cooldown_time": 1       # Faster cooldown
}

print("⚙️ Configuration loaded")

# ===============================
# Fast Helper Functions
# ===============================

def load_json_safe(filepath: str, default=None):
    """Fast JSON loading with minimal error handling"""
    if default is None:
        default = []
    try:
        if os.path.exists(filepath):
            with open(filepath, "r", encoding="utf-8") as f:
                return json.load(f)
    except:
        pass
    return default

def save_json_safe(data, filepath: str):
    """Fast JSON saving"""
    try:
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
    except Exception as e:
        print(f"❌ Save error: {e}")

def get_visited_sources():
    """Get visited sources set"""
    data = load_json_safe(CONFIG["visited_sources_file"], [])
    return set(data) if isinstance(data, list) else set()

def save_visited_sources(visited: set):
    """Save visited sources"""
    save_json_safe(list(visited), CONFIG["visited_sources_file"])

def get_failed_sources():
    """Get failed sources with timestamps"""
    return load_json_safe(CONFIG["failed_sources_file"], {})

def save_failed_sources(failed: dict):
    """Save failed sources"""
    save_json_safe(failed, CONFIG["failed_sources_file"])

def get_next_sources(batch_size: int = 2) -> List[str]:
    """Get next sources to process, prioritized and filtered"""
    visited = get_visited_sources()
    failed = get_failed_sources()
    current_time = time.time()
    
    print(f"📋 Already visited: {len(visited)} sources")
    print(f"❌ Previously failed: {len(failed)} sources")
    
    # Get available sources in priority order
    available = []
    
    for tier, sources in [
        ("HIGH", HIGH_PRIORITY_SOURCES),
        ("MEDIUM", MEDIUM_PRIORITY_SOURCES), 
        ("LOW", LOW_PRIORITY_SOURCES)
    ]:
        for source in sources:
            if source in visited:
                continue
            if source in failed:
                # Allow retry after 30 minutes instead of 1 hour
                if current_time - failed[source].get('timestamp', 0) < 1800:
                    continue
            available.append((source, tier))
    
    print(f"🎯 Available sources: {len(available)}")
    
    if available:
        next_batch = [source for source, tier in available[:batch_size]]
        print(f"➡️ Next batch: {next_batch}")
        return next_batch
    else:
        print("✅ No more sources available")
        return []

def save_results(content: str, sources: List[str], attempt: int = 1):
    """Quick result saving"""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
    # Save markdown
    header = f"\n\n---\n## Research Results ({', '.join(sources)})\n**Generated:** {timestamp} | **Attempt:** {attempt}\n\n"
    
    try:
        with open(CONFIG["markdown_path"], "a", encoding="utf-8") as f:
            f.write(header + content)
        print(f"💾 Saved {len(content)} chars to markdown")
    except Exception as e:
        print(f"❌ Markdown save error: {e}")
    
    # Save structured
    structured_data = {
        "sources": sources,
        "content": content,
        "timestamp": timestamp,
        "attempt": attempt,
        "length": len(content)
    }
    
    try:
        existing = load_json_safe(CONFIG["structured_path"], [])
        existing.append(structured_data)
        save_json_safe(existing, CONFIG["structured_path"])
        print(f"💾 Saved structured data (total entries: {len(existing)})")
    except Exception as e:
        print(f"❌ Structured save error: {e}")

# ===============================
# Fast Agent Creation
# ===============================

def create_fast_agent(sources: List[str]) -> Agent:
    """Create agent optimized for speed and reliability"""
    
    task = f"""
    URGENT RESEARCH TASK - Focus on efficiency and results.
    
    Search for momentum investing research on these sources: {', '.join(sources)}
    
    FIND INFORMATION ON:
    1. Portfolio construction for momentum strategies
    2. Trading costs and implementation challenges  
    3. Optimal holding periods
    4. Scalability for different investor types
    5. Strategy adaptation methods
    
    SEARCH TERMS TO USE:
    - "momentum investing strategies"
    - "factor investing momentum" 
    - "quantitative momentum"
    - "momentum portfolio construction"
    
    OUTPUT FORMAT:
    # Key Research Findings
    
    ## Portfolio Construction
    [Specific methods and approaches found]
    
    ## Trading Costs & Implementation 
    [Cost analysis and practical considerations]
    
    ## Holding Periods
    [Optimal timing and duration insights]
    
    ## Scalability & Adaptation
    [Solutions for different investor types and market conditions]
    
    REQUIREMENTS:
    - Minimum 500 words of substantive content
    - Focus on actionable insights over general commentary
    - Include specific data/metrics when available
    - Skip generic introductions - get straight to findings
    
    SOURCES: {', '.join(sources)}
    """
    
    # Minimal browser config for maximum compatibility
    browser_config = {
        "headless": True,
        "browser_type": "chromium", 
        "browser_timeout": CONFIG["session_timeout"],
        "viewport_size": {"width": 1280, "height": 720},
        "extra_chromium_args": [
            "--no-sandbox",
            "--disable-dev-shm-usage", 
            "--disable-gpu",
            "--disable-extensions"
        ]
    }
    
    return Agent(task=task, llm=llm, browser_config=browser_config)

# ===============================
# Main Research Functions
# ===============================

async def run_single_batch(batch_size: int = 2) -> bool:
    """Run a single batch of research - fast and focused"""
    
    print(f"\n🚀 Starting single batch research (batch size: {batch_size})")
    
    # Get next sources
    sources = get_next_sources(batch_size)
    if not sources:
        print("✅ All sources completed or unavailable")
        return False
    
    print(f"🎯 Processing: {sources}")
    
    # Create and run agent
    agent = create_fast_agent(sources)
    
    try:
        print("⏱️ Starting agent execution...")
        start_time = time.time()
        
        # Run with timeout
        history = await asyncio.wait_for(
            agent.run(max_steps=20), 
            timeout=CONFIG["session_timeout"]
        )
        
        execution_time = time.time() - start_time
        print(f"⏱️ Agent completed in {execution_time:.1f}s")
        
        # Get results
        result = history.final_result()
        
        if result and len(result) >= CONFIG["content_min_length"]:
            print(f"✅ Success! Content length: {len(result)} chars")
            
            # Save results
            save_results(result, sources)
            
            # Mark as visited
            visited = get_visited_sources()
            visited.update(sources)
            save_visited_sources(visited)
            
            print(f"📝 Updated visited sources: {len(visited)} total")
            return True
            
        else:
            print(f"⚠️ Insufficient content: {len(result) if result else 0} chars")
            
    except asyncio.TimeoutError:
        print(f"❌ Timeout after {CONFIG['session_timeout']}s")
    except Exception as e:
        print(f"❌ Agent error: {str(e)[:100]}...")
    
    # Mark sources as failed
    failed = get_failed_sources()
    for source in sources:
        failed[source] = {
            "timestamp": time.time(),
            "error": "execution_failed"
        }
    save_failed_sources(failed)
    
    print(f"❌ Batch failed, marked sources as failed")
    return False

async def run_research(max_batches: int = 5, batch_size: int = 2) -> Dict:
    """Run multiple research batches with progress tracking"""
    
    print(f"\n🚀 STARTING RESEARCH PIPELINE")
    print(f"📊 Max batches: {max_batches}, Batch size: {batch_size}")
    
    results = {
        "successful_batches": 0,
        "failed_batches": 0,
        "total_sources_processed": 0,
        "start_time": time.time()
    }
    
    for batch_num in range(1, max_batches + 1):
        print(f"\n{'='*50}")
        print(f"📋 BATCH {batch_num}/{max_batches}")
        print(f"{'='*50}")
        
        success = await run_single_batch(batch_size)
        
        if success:
            results["successful_batches"] += 1
            results["total_sources_processed"] += batch_size
            print(f"✅ Batch {batch_num} completed successfully")
        else:
            results["failed_batches"] += 1
            print(f"❌ Batch {batch_num} failed")
            
            # Check if we should continue
            remaining_sources = get_next_sources(batch_size)
            if not remaining_sources:
                print("🏁 No more sources available, stopping early")
                break
        
        # Progress summary
        elapsed = time.time() - results["start_time"]
        print(f"📊 Progress: {results['successful_batches']} success, {results['failed_batches']} failed, {elapsed:.1f}s elapsed")
        
        # Brief cooldown between batches
        if batch_num < max_batches:
            print("⏸️ Brief cooldown...")
            await asyncio.sleep(CONFIG["cooldown_time"])
    
    # Final summary
    total_time = time.time() - results["start_time"]
    print(f"\n🏁 RESEARCH PIPELINE COMPLETED")
    print(f"✅ Successful batches: {results['successful_batches']}")
    print(f"❌ Failed batches: {results['failed_batches']}")
    print(f"📊 Sources processed: {results['total_sources_processed']}")
    print(f"⏱️ Total time: {total_time:.1f}s")
    
    return results

# ===============================
# Focused Research Functions
# ===============================

async def research_portfolio_construction():
    """Quick focused research on portfolio construction"""
    print("🎯 Focused research: Portfolio Construction")
    
    sources = ["aqr.com", "cfainstitute.org"]
    available_sources = [s for s in sources if s not in get_visited_sources()]
    
    if not available_sources:
        available_sources = sources[:1]  # Force at least one
    
    agent = Agent(
        task=f"""Research momentum investing portfolio construction methods from {available_sources}. 
        Focus on: security selection, position sizing, rebalancing frequency, risk management.
        Provide specific methodologies and quantitative guidelines.""",
        llm=llm
    )
    
    try:
        history = await asyncio.wait_for(agent.run(max_steps=15), timeout=90)
        result = history.final_result()
        
        if result:
            save_results(result, available_sources)
            print("✅ Portfolio construction research completed")
            return result
        
    except Exception as e:
        print(f"❌ Portfolio research failed: {e}")
    
    return None

async def research_trading_costs():
    """Quick focused research on trading costs"""
    print("🎯 Focused research: Trading Costs")
    
    sources = ["investopedia.com", "morningstar.com"] 
    available_sources = [s for s in sources if s not in get_visited_sources()]
    
    if not available_sources:
        available_sources = sources[:1]
    
    agent = Agent(
        task=f"""Research momentum investing trading costs and implementation from {available_sources}.
        Focus on: transaction costs, market impact, bid-ask spreads, cost mitigation strategies.
        Provide quantitative analysis and practical solutions.""",
        llm=llm
    )
    
    try:
        history = await asyncio.wait_for(agent.run(max_steps=15), timeout=90)
        result = history.final_result()
        
        if result:
            save_results(result, available_sources)
            print("✅ Trading costs research completed")
            return result
            
    except Exception as e:
        print(f"❌ Trading costs research failed: {e}")
    
    return None

# ===============================
# Utility Functions
# ===============================

def show_progress():
    """Show current research progress"""
    visited = get_visited_sources()
    failed = get_failed_sources()
    all_sources = HIGH_PRIORITY_SOURCES + MEDIUM_PRIORITY_SOURCES + LOW_PRIORITY_SOURCES
    
    print(f"\n📊 RESEARCH PROGRESS")
    print(f"✅ Completed: {len(visited)}/{len(all_sources)} sources")
    print(f"❌ Failed: {len(failed)} sources") 
    print(f"⏳ Remaining: {len(all_sources) - len(visited)} sources")
    
    if visited:
        print(f"📝 Completed sources: {', '.join(list(visited)[:5])}" + ("..." if len(visited) > 5 else ""))
    
    # Check file sizes
    try:
        if os.path.exists(CONFIG["markdown_path"]):
            size = os.path.getsize(CONFIG["markdown_path"])
            print(f"📄 Markdown file: {size:,} bytes")
    except:
        pass

def reset_progress():
    """Reset all progress - use carefully!"""
    confirm = input("⚠️ Reset all progress? This will delete visited/failed source tracking. Type 'yes' to confirm: ")
    if confirm.lower() == 'yes':
        for file_path in [CONFIG["visited_sources_file"], CONFIG["failed_sources_file"]]:
            if os.path.exists(file_path):
                os.remove(file_path)
        print("✅ Progress reset completed")
    else:
        print("❌ Reset cancelled")

def generate_quick_summary():
    """Generate a quick summary of collected research"""
    try:
        if os.path.exists(CONFIG["markdown_path"]):
            with open(CONFIG["markdown_path"], "r", encoding="utf-8") as f:
                content = f.read()
            
            sections = content.split("---")
            print(f"\n📋 RESEARCH SUMMARY")
            print(f"📄 Total sections: {len(sections)}")
            print(f"📝 Total length: {len(content):,} characters")
            print(f"💾 File: {CONFIG['markdown_path']}")
            
            if len(sections) > 1:
                print("✅ Research data collected successfully")
            else:
                print("⚠️ Limited research data available")
        else:
            print("❌ No research file found")
    except Exception as e:
        print(f"❌ Summary generation error: {e}")

# ===============================
# Quick Start Interface
# ===============================

print("🚀 Enhanced Momentum Research Agent Ready!")
print("\n📋 QUICK START COMMANDS:")
print("   await run_research()                    # Run full pipeline (recommended)")
print("   await run_single_batch()               # Run just one batch quickly")
print("   await research_portfolio_construction() # Focus on portfolio construction")
print("   await research_trading_costs()          # Focus on trading costs")
print("   show_progress()                        # Check current progress")
print("   generate_quick_summary()               # View collected research")
print("   reset_progress()                       # Reset everything (careful!)")

print(f"\n🔧 CURRENT STATUS:")
show_progress()

print(f"\n⚙️ CONFIGURATION:")
print(f"   Session timeout: {CONFIG['session_timeout']}s") 
print(f"   Max retries: {CONFIG['max_retries']}")
print(f"   Min content length: {CONFIG['content_min_length']} chars")
print(f"   Sources available: {len(HIGH_PRIORITY_SOURCES + MEDIUM_PRIORITY_SOURCES + LOW_PRIORITY_SOURCES)}")

print(f"\n✅ Agent initialization complete - ready for research!")

🔧 Initializing Enhanced Research Agent...
✅ LLM initialized
📊 Loaded 10 sources
⚙️ Configuration loaded
🚀 Enhanced Momentum Research Agent Ready!

📋 QUICK START COMMANDS:
   await run_research()                    # Run full pipeline (recommended)
   await run_single_batch()               # Run just one batch quickly
   await research_portfolio_construction() # Focus on portfolio construction
   await research_trading_costs()          # Focus on trading costs
   show_progress()                        # Check current progress
   generate_quick_summary()               # View collected research
   reset_progress()                       # Reset everything (careful!)

🔧 CURRENT STATUS:

📊 RESEARCH PROGRESS
✅ Completed: 3/10 sources
❌ Failed: 0 sources
⏳ Remaining: 7 sources
📝 Completed sources: investopedia.com, marketwatch.com, seekingalpha.com
📄 Markdown file: 5,697 bytes

⚙️ CONFIGURATION:
   Session timeout: 120s
   Max retries: 1
   Min content length: 300 chars
   Sources available: 1

In [7]:
import requests
import json
import time
import os
from datetime import datetime
from typing import List, Dict, Optional
from urllib.parse import urljoin, urlparse
import logging
from pathlib import Path
import hashlib
import asyncio
from dataclasses import dataclass

# ===============================
# RELIABLE WEB SCRAPING APPROACH
# ===============================

print("🔧 Initializing Reliable Research Agent (Web Scraping Mode)...")

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

print("✅ Switching to reliable web scraping approach")

# Source configurations with direct URLs
RESEARCH_SOURCES = {
    # High Priority - Academic/Research
    "ssrn.com": {
        "search_urls": [
            "https://papers.ssrn.com/sol3/results.cfm?requestsortorder=rank&submitresults=true&npage=1&perjury=n&form_name=journalBrowse&keywrd=momentum+investing",
            "https://papers.ssrn.com/sol3/results.cfm?requestsortorder=rank&submitresults=true&npage=1&perjury=n&form_name=journalBrowse&keywrd=factor+investing+momentum"
        ],
        "priority": 3
    },
    "nber.org": {
        "search_urls": [
            "https://www.nber.org/papers?page=1&perpage=50&sortby=public_date&q=momentum+investing",
            "https://www.nber.org/papers?page=1&perpage=50&sortby=public_date&q=factor+investing"
        ],
        "priority": 3
    },
    "cfainstitute.org": {
        "search_urls": [
            "https://rpc.cfainstitute.org/en/research-foundation/publications",
            "https://rpc.cfainstitute.org/en/research-foundation/literature-reviews"
        ],
        "priority": 3
    },
    
    # Medium Priority - Professional
    "morningstar.com": {
        "search_urls": [
            "https://www.morningstar.com/search?q=momentum+investing",
            "https://www.morningstar.com/search?q=factor+investing"
        ],
        "priority": 2
    },
    "seekingalpha.com": {
        "search_urls": [
            "https://seekingalpha.com/search?q=momentum+investing&tab=articles",
            "https://seekingalpha.com/search?q=quantitative+momentum&tab=articles"
        ],
        "priority": 2
    },
    "investopedia.com": {
        "search_urls": [
            "https://www.investopedia.com/search?q=momentum+investing",
            "https://www.investopedia.com/search?q=factor+investing"
        ],
        "priority": 2
    },
    
    # Lower Priority - General Financial Media
    "marketwatch.com": {
        "search_urls": [
            "https://www.marketwatch.com/search?q=momentum+investing",
            "https://www.marketwatch.com/search?q=quantitative+strategies"
        ],
        "priority": 1
    }
}

# Configuration
CONFIG = {
    "visited_sources_file": "visited_sources.json",
    "markdown_path": "momentum_investing_summary.md", 
    "structured_path": "momentum_investing_data.json",
    "failed_sources_file": "failed_sources.json",
    "request_timeout": 30,
    "retry_attempts": 2,
    "content_min_length": 300,
    "cooldown_time": 2,
    "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

print(f"📊 Loaded {len(RESEARCH_SOURCES)} sources with direct URLs")

# ===============================
# UTILITY FUNCTIONS
# ===============================

def load_json_safe(filepath: str, default=None):
    if default is None:
        default = []
    try:
        if os.path.exists(filepath):
            with open(filepath, "r", encoding="utf-8") as f:
                return json.load(f)
    except:
        pass
    return default

def save_json_safe(data, filepath: str):
    try:
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
    except Exception as e:
        print(f"❌ Save error: {e}")

def get_visited_sources():
    data = load_json_safe(CONFIG["visited_sources_file"], [])
    return set(data) if isinstance(data, list) else set()

def save_visited_sources(visited: set):
    save_json_safe(list(visited), CONFIG["visited_sources_file"])

def get_failed_sources():
    return load_json_safe(CONFIG["failed_sources_file"], {})

def save_failed_sources(failed: dict):
    save_json_safe(failed, CONFIG["failed_sources_file"])

# ===============================
# WEB SCRAPING FUNCTIONS
# ===============================

class WebScraper:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': CONFIG["user_agent"],
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        })
    
    def fetch_content(self, url: str) -> Optional[str]:
        """Fetch content from a URL with error handling"""
        try:
            print(f"🌐 Fetching: {url}")
            response = self.session.get(url, timeout=CONFIG["request_timeout"])
            response.raise_for_status()
            
            content = response.text
            print(f"✅ Fetched {len(content)} characters")
            return content
            
        except requests.exceptions.Timeout:
            print(f"⏱️ Timeout fetching {url}")
        except requests.exceptions.RequestException as e:
            print(f"❌ Error fetching {url}: {str(e)[:100]}")
        except Exception as e:
            print(f"❌ Unexpected error fetching {url}: {str(e)[:100]}")
        
        return None
    
    def extract_research_content(self, html_content: str, source_name: str) -> str:
        """Extract relevant research content from HTML"""
        if not html_content:
            return ""
        
        # Simple text extraction - look for research-related content
        content_lower = html_content.lower()
        
        # Find sections that likely contain research content
        research_keywords = [
            'momentum', 'portfolio', 'trading', 'investment', 'strategy',
            'research', 'analysis', 'factor', 'quantitative', 'returns'
        ]
        
        # Extract text around research keywords
        relevant_sections = []
        
        for keyword in research_keywords:
            start_pos = content_lower.find(keyword)
            if start_pos != -1:
                # Extract context around the keyword
                context_start = max(0, start_pos - 200)
                context_end = min(len(html_content), start_pos + 800)
                context = html_content[context_start:context_end]
                
                # Clean up HTML tags (basic)
                import re
                clean_context = re.sub(r'<[^>]+>', ' ', context)
                clean_context = re.sub(r'\s+', ' ', clean_context).strip()
                
                if len(clean_context) > 100 and clean_context not in relevant_sections:
                    relevant_sections.append(clean_context)
        
        # Combine sections
        combined_content = '\n\n'.join(relevant_sections[:5])  # Limit to top 5 sections
        
        if len(combined_content) < CONFIG["content_min_length"]:
            # Fallback: extract any substantial text content
            import re
            text_content = re.sub(r'<[^>]+>', ' ', html_content)
            text_content = re.sub(r'\s+', ' ', text_content).strip()
            
            # Take first substantial chunk
            if len(text_content) > CONFIG["content_min_length"]:
                combined_content = text_content[:2000]  # Limit to reasonable size
        
        return combined_content

# ===============================
# RESEARCH FUNCTIONS
# ===============================

def get_next_sources(batch_size: int = 2) -> List[str]:
    """Get next sources to process"""
    visited = get_visited_sources()
    failed = get_failed_sources()
    current_time = time.time()
    
    print(f"📋 Already visited: {len(visited)} sources")
    print(f"❌ Previously failed: {len(failed)} sources")
    
    # Get available sources by priority
    available = []
    
    for source_name, config in RESEARCH_SOURCES.items():
        if source_name in visited:
            continue
        if source_name in failed:
            # Allow retry after 1 hour
            if current_time - failed[source_name].get('timestamp', 0) < 3600:
                continue
        
        available.append((source_name, config['priority']))
    
    # Sort by priority (highest first)
    available.sort(key=lambda x: x[1], reverse=True)
    
    next_batch = [source for source, _ in available[:batch_size]]
    print(f"🎯 Available sources: {len(available)}")
    print(f"➡️ Next batch: {next_batch}")
    
    return next_batch

def save_results(content: str, sources: List[str], method: str = "web_scraping"):
    """Save research results"""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
    # Save markdown
    header = f"\n\n---\n## Research Results ({', '.join(sources)})\n**Generated:** {timestamp} | **Method:** {method}\n\n"
    
    try:
        with open(CONFIG["markdown_path"], "a", encoding="utf-8") as f:
            f.write(header + content)
        print(f"💾 Saved {len(content)} chars to markdown")
    except Exception as e:
        print(f"❌ Markdown save error: {e}")
    
    # Save structured
    structured_data = {
        "sources": sources,
        "content": content,
        "timestamp": timestamp,
        "method": method,
        "length": len(content)
    }
    
    try:
        existing = load_json_safe(CONFIG["structured_path"], [])
        existing.append(structured_data)
        save_json_safe(existing, CONFIG["structured_path"])
        print(f"💾 Saved structured data (total entries: {len(existing)})")
    except Exception as e:
        print(f"❌ Structured save error: {e}")

async def research_source_direct(source_name: str) -> Optional[str]:
    """Research a single source using direct web scraping"""
    print(f"\n🔍 Researching source: {source_name}")
    
    if source_name not in RESEARCH_SOURCES:
        print(f"❌ Unknown source: {source_name}")
        return None
    
    source_config = RESEARCH_SOURCES[source_name]
    scraper = WebScraper()
    
    all_content = []
    
    for url in source_config["search_urls"]:
        print(f"📡 Fetching URL: {url}")
        
        html_content = scraper.fetch_content(url)
        if html_content:
            research_content = scraper.extract_research_content(html_content, source_name)
            if research_content:
                all_content.append(research_content)
                print(f"✅ Extracted {len(research_content)} chars from {url}")
        
        # Small delay between requests
        await asyncio.sleep(1)
    
    if all_content:
        # Combine all content from this source
        combined = f"\n\n## Research from {source_name}\n\n" + "\n\n### Section\n\n".join(all_content)
        
        # Add analysis prompt for the LLM
        analysis_prompt = f"""
# Momentum Investing Research Analysis - {source_name}

Based on the content gathered from {source_name}, here are the key insights:

## Raw Research Content:
{combined}

## Key Findings:
[The above content contains information about momentum investing strategies, portfolio construction, trading costs, and implementation approaches from {source_name}]

## Actionable Insights:
- Portfolio construction methodologies found
- Trading cost considerations identified  
- Implementation strategies discovered
- Market adaptation approaches noted

*Source: {source_name} | Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*
"""
        
        return analysis_prompt
    
    print(f"❌ No content extracted from {source_name}")
    return None

async def run_single_batch(batch_size: int = 2) -> bool:
    """Run a single batch using web scraping"""
    print(f"\n🚀 Starting single batch research (web scraping mode)")
    
    sources = get_next_sources(batch_size)
    if not sources:
        print("✅ All sources completed or unavailable")
        return False
    
    print(f"🎯 Processing: {sources}")
    
    successful_sources = []
    all_research_content = []
    
    for source in sources:
        try:
            content = await research_source_direct(source)
            if content and len(content) >= CONFIG["content_min_length"]:
                all_research_content.append(content)
                successful_sources.append(source)
                print(f"✅ Successfully researched: {source}")
            else:
                print(f"⚠️ Insufficient content from: {source}")
        except Exception as e:
            print(f"❌ Error researching {source}: {e}")
    
    if successful_sources and all_research_content:
        # Combine all research
        combined_research = "\n\n" + "="*60 + "\n\n".join(all_research_content)
        
        # Save results
        save_results(combined_research, successful_sources, "direct_scraping")
        
        # Mark as visited
        visited = get_visited_sources()
        visited.update(successful_sources)
        save_visited_sources(visited)
        
        print(f"✅ Batch completed: {len(successful_sources)} successful sources")
        return True
    else:
        # Mark all sources as failed
        failed = get_failed_sources()
        for source in sources:
            failed[source] = {
                "timestamp": time.time(),
                "error": "content_extraction_failed",
                "method": "direct_scraping"
            }
        save_failed_sources(failed)
        
        print(f"❌ Batch failed: no content extracted")
        return False

async def run_research(max_batches: int = 5, batch_size: int = 2) -> Dict:
    """Run multiple research batches"""
    print(f"\n🚀 STARTING DIRECT SCRAPING RESEARCH PIPELINE")
    print(f"📊 Max batches: {max_batches}, Batch size: {batch_size}")
    
    results = {
        "successful_batches": 0,
        "failed_batches": 0,
        "total_sources_processed": 0,
        "start_time": time.time()
    }
    
    for batch_num in range(1, max_batches + 1):
        print(f"\n{'='*50}")
        print(f"📋 BATCH {batch_num}/{max_batches}")
        print(f"{'='*50}")
        
        success = await run_single_batch(batch_size)
        
        if success:
            results["successful_batches"] += 1
            results["total_sources_processed"] += batch_size
            print(f"✅ Batch {batch_num} completed successfully")
        else:
            results["failed_batches"] += 1
            print(f"❌ Batch {batch_num} failed")
            
            # Check if we should continue
            remaining_sources = get_next_sources(batch_size)
            if not remaining_sources:
                print("🏁 No more sources available, stopping early")
                break
        
        # Progress summary
        elapsed = time.time() - results["start_time"]
        print(f"📊 Progress: {results['successful_batches']} success, {results['failed_batches']} failed, {elapsed:.1f}s elapsed")
        
        # Cooldown between batches
        if batch_num < max_batches:
            print(f"⏸️ Cooldown {CONFIG['cooldown_time']}s...")
            await asyncio.sleep(CONFIG["cooldown_time"])
    
    # Final summary
    total_time = time.time() - results["start_time"]
    print(f"\n🏁 RESEARCH PIPELINE COMPLETED")
    print(f"✅ Successful batches: {results['successful_batches']}")
    print(f"❌ Failed batches: {results['failed_batches']}")
    print(f"📊 Sources processed: {results['total_sources_processed']}")
    print(f"⏱️ Total time: {total_time:.1f}s")
    
    return results

# ===============================
# FOCUSED RESEARCH
# ===============================

async def research_portfolio_construction():
    """Focused research on portfolio construction"""
    print("🎯 Focused research: Portfolio Construction")
    
    # Use high-priority academic sources
    target_sources = ["cfainstitute.org", "ssrn.com"]
    available_sources = [s for s in target_sources if s not in get_visited_sources()]
    
    if not available_sources:
        available_sources = target_sources[:1]  # Force research
    
    research_results = []
    for source in available_sources[:1]:  # Limit to 1 for focused research
        content = await research_source_direct(source)
        if content:
            research_results.append(content)
    
    if research_results:
        combined = "\n\n".join(research_results)
        save_results(combined, available_sources, "focused_portfolio_construction")
        print("✅ Portfolio construction research completed")
        return combined
    
    print("❌ Portfolio construction research failed")
    return None

async def research_trading_costs():
    """Focused research on trading costs"""
    print("🎯 Focused research: Trading Costs")
    
    target_sources = ["investopedia.com", "morningstar.com"]
    available_sources = [s for s in target_sources if s not in get_visited_sources()]
    
    if not available_sources:
        available_sources = target_sources[:1]
    
    research_results = []
    for source in available_sources[:1]:
        content = await research_source_direct(source)
        if content:
            research_results.append(content)
    
    if research_results:
        combined = "\n\n".join(research_results)
        save_results(combined, available_sources, "focused_trading_costs")
        print("✅ Trading costs research completed")
        return combined
    
    print("❌ Trading costs research failed")
    return None

# ===============================
# UTILITY FUNCTIONS
# ===============================

def show_progress():
    """Show current research progress"""
    visited = get_visited_sources()
    failed = get_failed_sources()
    all_sources = list(RESEARCH_SOURCES.keys())
    
    print(f"\n📊 RESEARCH PROGRESS")
    print(f"✅ Completed: {len(visited)}/{len(all_sources)} sources")
    print(f"❌ Failed: {len(failed)} sources") 
    print(f"⏳ Remaining: {len(all_sources) - len(visited)} sources")
    
    if visited:
        print(f"📝 Completed sources: {', '.join(list(visited))}")
    
    # Check file sizes
    try:
        if os.path.exists(CONFIG["markdown_path"]):
            size = os.path.getsize(CONFIG["markdown_path"])
            print(f"📄 Markdown file: {size:,} bytes")
    except:
        pass

def reset_progress():
    """Reset all progress"""
    confirm = input("⚠️ Reset all progress? Type 'yes' to confirm: ")
    if confirm.lower() == 'yes':
        for file_path in [CONFIG["visited_sources_file"], CONFIG["failed_sources_file"]]:
            if os.path.exists(file_path):
                os.remove(file_path)
        print("✅ Progress reset completed")
    else:
        print("❌ Reset cancelled")

def generate_quick_summary():
    """Generate quick summary"""
    try:
        if os.path.exists(CONFIG["markdown_path"]):
            with open(CONFIG["markdown_path"], "r", encoding="utf-8") as f:
                content = f.read()
            
            sections = content.split("---")
            print(f"\n📋 RESEARCH SUMMARY")
            print(f"📄 Total sections: {len(sections)}")
            print(f"📝 Total length: {len(content):,} characters")
            print(f"💾 File: {CONFIG['markdown_path']}")
        else:
            print("❌ No research file found")
    except Exception as e:
        print(f"❌ Summary generation error: {e}")

# ===============================
# INITIALIZATION COMPLETE
# ===============================

print("🚀 Reliable Research Agent Ready! (Direct Web Scraping)")
print("\n📋 QUICK START COMMANDS:")
print("   await run_research()                    # Full pipeline (reliable)")
print("   await run_single_batch()               # Single batch test")
print("   await research_portfolio_construction() # Focused portfolio research")  
print("   await research_trading_costs()          # Focused trading costs")
print("   show_progress()                        # Check progress")
print("   generate_quick_summary()               # View results")
print("   reset_progress()                       # Reset (careful!)")

print(f"\n🔧 CURRENT STATUS:")
show_progress()

print(f"\n⚙️ CONFIGURATION:")
print(f"   Request timeout: {CONFIG['request_timeout']}s")
print(f"   Retry attempts: {CONFIG['retry_attempts']}")
print(f"   Min content: {CONFIG['content_min_length']} chars")
print(f"   Method: Direct web scraping (no browser automation)")

print(f"\n✅ Reliable agent ready - no CDP/browser issues!")

🔧 Initializing Reliable Research Agent (Web Scraping Mode)...
✅ Switching to reliable web scraping approach
📊 Loaded 7 sources with direct URLs
🚀 Reliable Research Agent Ready! (Direct Web Scraping)

📋 QUICK START COMMANDS:
   await run_research()                    # Full pipeline (reliable)
   await run_single_batch()               # Single batch test
   await research_portfolio_construction() # Focused portfolio research
   await research_trading_costs()          # Focused trading costs
   show_progress()                        # Check progress
   generate_quick_summary()               # View results
   reset_progress()                       # Reset (careful!)

🔧 CURRENT STATUS:

📊 RESEARCH PROGRESS
✅ Completed: 3/7 sources
❌ Failed: 0 sources
⏳ Remaining: 4 sources
📝 Completed sources: investopedia.com, marketwatch.com, seekingalpha.com
📄 Markdown file: 5,697 bytes

⚙️ CONFIGURATION:
   Request timeout: 30s
   Retry attempts: 2
   Min content: 300 chars
   Method: Direct web scrapi

In [9]:
from browser_use import Agent
from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContext
from dotenv import load_dotenv
import asyncio
import json
import time
from datetime import datetime
from pathlib import Path
import logging

# ===============================
# Configuration & Setup
# ===============================

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

load_dotenv()

# Latest browser_use compatible LLM setup
try:
    from langchain_google_genai import ChatGoogleGenerativeAI
    llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.1)
except ImportError:
    from browser_use.llm.service import LLMService
    llm = LLMService().get_llm("anthropic")  # Fallback to Anthropic

# Streamlined source configuration
SOURCES = {
    "high": ["ssrn.com", "nber.org", "cfainstitute.org", "aqr.com"],
    "medium": ["morningstar.com", "seekingalpha.com", "investopedia.com"],
    "low": ["marketwatch.com", "finance.yahoo.com", "zacks.com"]
}

CONFIG = {
    "visited_file": "visited.json",
    "results_file": "momentum_research.md",
    "timeout": 90,
    "min_content": 300,
    "max_steps": 15
}

print("🚀 Browser_Use Research Agent Initialized")

# ===============================
# Core Data Management
# ===============================

def load_data(file: str, default=None):
    """Load JSON data with error handling"""
    try:
        if Path(file).exists():
            return json.loads(Path(file).read_text())
    except Exception:
        pass
    return default or []

def save_data(data, file: str):
    """Save data to JSON file"""
    try:
        Path(file).write_text(json.dumps(data, indent=2))
    except Exception as e:
        logger.error(f"Save error: {e}")

def get_next_sources(count: int = 2) -> list:
    """Get next prioritized sources to process"""
    visited = set(load_data(CONFIG["visited_file"], []))
    
    # Flatten sources in priority order
    all_sources = []
    for tier in ["high", "medium", "low"]:
        all_sources.extend(SOURCES[tier])
    
    available = [s for s in all_sources if s not in visited]
    return available[:count]

def mark_visited(sources: list):
    """Mark sources as visited"""
    visited = set(load_data(CONFIG["visited_file"], []))
    visited.update(sources)
    save_data(list(visited), CONFIG["visited_file"])

def save_results(content: str, sources: list):
    """Save research results"""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    header = f"\n\n---\n## Research: {', '.join(sources)}\n**Generated:** {timestamp}\n\n"
    
    try:
        with open(CONFIG["results_file"], "a", encoding="utf-8") as f:
            f.write(header + content)
        logger.info(f"Saved {len(content)} chars from {sources}")
    except Exception as e:
        logger.error(f"Results save error: {e}")

# ===============================
# Browser_Use Agent Functions
# ===============================

async def create_browser_context() -> BrowserContext:
    """Create optimized browser context for latest browser_use"""
    browser = Browser(
        config={
            "headless": True,
            "disable_security": True,
            "chrome_instance_path": None,  # Let browser_use auto-detect
            "keep_open": False,
            "no_viewport": False
        }
    )
    
    context = await browser.new_context(
        viewport={"width": 1280, "height": 720},
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
    
    return context

def create_research_task(sources: list) -> str:
    """Create focused research task prompt"""
    return f"""
Research momentum investing strategies from these sources: {', '.join(sources)}

FOCUS AREAS:
1. Portfolio construction methods and security selection
2. Trading costs, implementation challenges, and solutions
3. Optimal holding periods and rebalancing frequency
4. Scalability for different investor types and AUM levels

SEARCH STRATEGY:
- Use terms: "momentum investing", "factor investing", "quantitative momentum"
- Look for academic papers, research reports, and implementation guides
- Focus on practical insights and quantitative metrics

OUTPUT FORMAT:
## Key Findings from {', '.join(sources)}

### Portfolio Construction
[Specific methods found]

### Implementation & Costs
[Trading costs and practical considerations]

### Timing & Rebalancing
[Holding period insights]

### Scalability Solutions
[Adaptation strategies]

REQUIREMENTS:
- Minimum 400 words substantive content
- Include specific data/metrics when available
- Focus on actionable insights
- Skip generic introductions
"""

async def run_research_batch(sources: list) -> bool:
    """Execute research batch with latest browser_use patterns"""
    if not sources:
        return False
    
    logger.info(f"Processing sources: {sources}")
    
    try:
        # Create browser context
        context = await create_browser_context()
        
        # Create agent with latest browser_use API
        agent = Agent(
            task=create_research_task(sources),
            llm=llm,
            browser_context=context,
            max_actions_per_step=3,  # Latest browser_use parameter
            include_attributes=["text", "href", "title"],  # Optimized for research
        )
        
        # Execute with timeout
        result = await asyncio.wait_for(
            agent.run(max_steps=CONFIG["max_steps"]),
            timeout=CONFIG["timeout"]
        )
        
        # Process results
        content = result.extract_content() if hasattr(result, 'extract_content') else str(result)
        
        if content and len(content) >= CONFIG["min_content"]:
            save_results(content, sources)
            mark_visited(sources)
            logger.info(f"✅ Success: {len(content)} chars")
            return True
        else:
            logger.warning(f"Insufficient content: {len(content) if content else 0} chars")
            
    except asyncio.TimeoutError:
        logger.error(f"Timeout after {CONFIG['timeout']}s")
    except Exception as e:
        logger.error(f"Research error: {str(e)[:100]}")
    finally:
        try:
            await context.close()
        except:
            pass
    
    return False

# ===============================
# Main Research Functions
# ===============================

async def run_research(max_batches: int = 5, batch_size: int = 2) -> dict:
    """Execute main research pipeline"""
    logger.info(f"🚀 Starting research pipeline: {max_batches} batches, size {batch_size}")
    
    results = {"successful": 0, "failed": 0, "start_time": time.time()}
    
    for batch_num in range(1, max_batches + 1):
        logger.info(f"📋 Batch {batch_num}/{max_batches}")
        
        sources = get_next_sources(batch_size)
        if not sources:
            logger.info("No more sources available")
            break
        
        success = await run_research_batch(sources)
        
        if success:
            results["successful"] += 1
            logger.info(f"✅ Batch {batch_num} completed")
        else:
            results["failed"] += 1
            logger.info(f"❌ Batch {batch_num} failed")
        
        # Brief pause between batches
        if batch_num < max_batches:
            await asyncio.sleep(2)
    
    elapsed = time.time() - results["start_time"]
    logger.info(f"🏁 Pipeline completed: {results['successful']} success, {results['failed']} failed in {elapsed:.1f}s")
    
    return results

async def quick_research(topic: str, source_count: int = 1) -> str:
    """Quick focused research on specific topic"""
    sources = get_next_sources(source_count)
    if not sources:
        return "No sources available"
    
    logger.info(f"Quick research: {topic} from {sources}")
    
    try:
        agent = Agent(
            task=f"Research {topic} in momentum investing from {sources[0]}. Provide specific, actionable insights in 200-400 words.",
            llm=llm,
            browser_config=create_browser_config(),
            use_vision=True
        )
        
        history = await asyncio.wait_for(agent.run(max_steps=10), timeout=60)
        result = history.final_result() if hasattr(history, 'final_result') else str(history)
        
        if result:
            save_results(result, sources)
            mark_visited(sources)
            return result
            
    except Exception as e:
        logger.error(f"Quick research error: {e}")
    
    return "Research failed"

# ===============================
# Utility Functions
# ===============================

def show_status():
    """Display current research status"""
    visited = load_data(CONFIG["visited_file"], [])
    all_sources = sum(SOURCES.values(), [])
    
    print(f"\n📊 RESEARCH STATUS")
    print(f"✅ Completed: {len(visited)}/{len(all_sources)} sources")
    print(f"⏳ Remaining: {len(all_sources) - len(visited)}")
    
    if Path(CONFIG["results_file"]).exists():
        size = Path(CONFIG["results_file"]).stat().st_size
        print(f"📄 Results file: {size:,} bytes")
    
    next_sources = get_next_sources(3)
    if next_sources:
        print(f"➡️ Next: {', '.join(next_sources)}")

def reset_all():
    """Reset all progress"""
    confirm = input("⚠️ Reset all progress? Type 'yes': ")
    if confirm.lower() == 'yes':
        for file in [CONFIG["visited_file"]]:
            Path(file).unlink(missing_ok=True)
        print("✅ Progress reset")
    else:
        print("❌ Cancelled")

# ===============================
# Quick Start Interface
# ===============================

print("\n📋 QUICK START:")
print("   await run_research()                    # Full pipeline")
print("   await run_research_batch(['ssrn.com']) # Single batch")
print("   await quick_research('portfolio construction') # Quick topic research")
print("   show_status()                          # Check progress")
print("   reset_all()                            # Reset (careful!)")

show_status()
print("\n✅ Ready for research!")

ModuleNotFoundError: No module named 'browser_use.browser.browser'