In [6]:
from browser_use import Agent, ChatGoogle
from dotenv import load_dotenv
import os
import asyncio
import json
import feedparser
from datetime import datetime
import urllib.parse
import time

# ===============================
# Setup
# ===============================

# Load variables from a local .env file into the process environment.
# Presumably this is where the API key used by ChatGoogle comes from — TODO confirm.
load_dotenv()
# Single LLM client shared by every agent built in make_agent().
llm = ChatGoogle(model="gemini-2.5-flash")

# arXiv search queries (momentum investing / trading). Each query is fetched
# separately and stored on the resulting paper records as "query_term".
queries = [
    "momentum trading",
    "momentum investing", 
    "momentum transformer",
    "network momentum",
    "intraday momentum strategy"
]

# State and output files; all are relative paths, so they resolve against the
# current working directory.
# JSON list of arXiv ids that have already been handled.
visited_papers_file = "visited_arxiv.json"
# Markdown file that agent summaries are appended to.
markdown_path = "arxiv_momentum_summary.md"
# JSON array that structured per-paper records are appended to.
structured_path = "arxiv_momentum_data.json"

# ===============================
# Helpers
# ===============================

def load_visited_papers():
    """Return the set of arXiv ids stored in ``visited_papers_file``.

    The file is expected to hold a JSON list of id strings.

    Returns:
        set: the stored ids. An empty set is returned when the file does not
        exist, contains empty/invalid JSON, or holds something other than a
        JSON list. The last two cases print a warning, because the next
        save will overwrite the unreadable file.
    """
    # EAFP: open directly instead of exists()+open(), which can race with
    # another process removing the file in between.
    try:
        with open(visited_papers_file, "r", encoding="utf-8") as f:
            stored = json.load(f)
    except FileNotFoundError:
        return set()
    except json.JSONDecodeError as e:
        # An interrupted write leaves an empty or truncated file; starting
        # fresh is preferable to aborting the whole run.
        print(
            f"Warning: could not parse {visited_papers_file} ({e}); "
            "starting with no visited papers"
        )
        return set()

    if not isinstance(stored, list):
        print(
            f"Warning: {visited_papers_file} does not contain a JSON list; "
            "starting with no visited papers"
        )
        return set()

    return set(stored)

def save_visited_papers(visited):
    """Persist the visited arXiv ids to ``visited_papers_file`` as a JSON list.

    Args:
        visited: iterable (normally a set) of arXiv id strings.

    The ids are written in sorted order so repeated saves of the same set
    produce identical files. The data is serialized and written to a
    temporary sibling file first and then moved into place with
    ``os.replace``, so a crash or serialization error cannot leave a
    truncated state file behind.

    Raises:
        TypeError: if an id is not JSON-serializable.
        OSError: if the file cannot be written or replaced.
    """
    # key=str keeps sorting well-defined even if a non-string id slips in.
    payload = json.dumps(sorted(visited, key=str), indent=2)

    tmp_path = f"{visited_papers_file}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.write(payload)
    os.replace(tmp_path, visited_papers_file)

def append_markdown(new_text, path=markdown_path):
    """Append ``new_text`` to the markdown file at ``path``.

    A horizontal-rule separator (``---`` surrounded by blank lines) is
    written before the text on every call. The file is created if it does
    not exist yet.
    """
    separator = "\n\n---\n\n"
    with open(path, mode="a", encoding="utf-8") as md_file:
        md_file.writelines((separator, new_text))

def append_structured(new_data, path=structured_path):
    """Append one or more records to the JSON array stored at ``path``.

    Args:
        new_data: the record(s) to add. Accepted forms:
            * a list/tuple of records - each element is appended;
            * a single record (e.g. a dict) - appended as one element;
            * a string - parsed as JSON and then handled as above; if it is
              not valid JSON it is stored as ``{"raw": <string>}``.
        path: JSON file to update; created if it does not exist.

    Raises:
        json.JSONDecodeError: if the existing file is not valid JSON. It is
            deliberately not overwritten so the data can be recovered.
        TypeError: if a record is not JSON-serializable. The existing file is
            left untouched in that case.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            existing = json.load(f)
        # Tolerate a file that holds a single object instead of an array.
        if not isinstance(existing, list):
            existing = [existing]
    else:
        existing = []

    if isinstance(new_data, str):
        try:
            new_data = json.loads(new_data)
        except json.JSONDecodeError:
            new_data = [{"raw": new_data}]

    # Normalize to a list: a lone dict (or a decoded JSON object/scalar) would
    # otherwise make the list concatenation below raise TypeError.
    if isinstance(new_data, (list, tuple)):
        records = list(new_data)
    else:
        records = [new_data]

    # Serialize BEFORE opening the file for writing: opening with "w"
    # truncates it, so a serialization error afterwards would wipe every
    # previously saved record.
    payload = json.dumps(existing + records, indent=2)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)

# ===============================
# Fetch from arXiv API (FIXED)
# ===============================

def _entry_to_paper(entry, query, position):
    """Convert one feedparser entry into the plain-dict paper record used by this script."""
    entry_id = getattr(entry, "id", None)
    title = getattr(entry, "title", None)
    published = getattr(entry, "published", None)
    summary = getattr(entry, "summary", None)
    authors = getattr(entry, "authors", None) or []

    return {
        # The id is the last path segment of the entry URL, e.g. "1211.6517v1".
        "id": entry_id.split('/')[-1] if entry_id is not None else f"unknown_{position}",
        "title": title.replace('\n', ' ').strip() if title is not None else "No title",
        # Skip author entries without a name instead of failing the whole fetch.
        "authors": [a.name for a in authors if hasattr(a, "name")],
        "year": published[:4] if published is not None else "Unknown",
        "url": getattr(entry, "link", ""),
        "abstract": summary.replace('\n', ' ').strip() if summary is not None else "",
        "source": "arXiv",
        "query_term": query
    }


def fetch_arxiv(query, max_results=5, retries=3):
    """Fetch papers from the arXiv API for a free-text query.

    Args:
        query: search phrase; it is URL-encoded and searched in all fields.
        max_results: maximum number of entries requested from the API.
        retries: number of attempts before giving up.

    Returns:
        list[dict]: one record per entry with the keys ``id``, ``title``,
        ``authors``, ``year``, ``url``, ``abstract``, ``source`` and
        ``query_term``. An empty list is returned when the query has no
        results or when every attempt failed.

    This function is synchronous: it blocks during the download and during
    the exponential backoff (1s, 2s, 4s, ...) between failed attempts.
    """
    # URL encode the query to handle spaces and special characters
    encoded_query = urllib.parse.quote(query)
    url = f"http://export.arxiv.org/api/query?search_query=all:{encoded_query}&start=0&max_results={max_results}"

    for attempt in range(retries):
        try:
            print(f"Fetching arXiv papers for query: '{query}' (attempt {attempt + 1})")
            feed = feedparser.parse(url)
            entries = getattr(feed, "entries", None) or []

            if getattr(feed, "bozo", False):
                problem = getattr(feed, "bozo_exception", "unknown error")
                if not entries:
                    # feedparser reports download/parse failures through the
                    # bozo flag rather than by raising, so without this check
                    # a network error would look like "0 papers found" and
                    # the retry loop would never run.
                    raise RuntimeError(f"feed could not be retrieved or parsed: {problem}")
                # Entries were still recovered: keep them, but surface the issue.
                print(f"Warning: Feed parser encountered an issue: {problem}")

            papers = [
                _entry_to_paper(entry, query, position)
                for position, entry in enumerate(entries)
            ]

            print(f"Successfully fetched {len(papers)} papers for '{query}'")
            return papers

        except Exception as e:
            print(f"Error fetching arXiv papers (attempt {attempt + 1}): {str(e)}")
            if attempt < retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
            else:
                print(f"Failed to fetch papers for query '{query}' after {retries} attempts")
                return []

    # Only reached when retries <= 0 (the loop body never ran).
    return []

# ===============================
# Agent Setup (IMPROVED)
# ===============================

def make_agent(paper_batch):
    """Build an Agent whose task is to analyse one batch of paper records.

    Args:
        paper_batch: list of JSON-serializable paper dicts; the whole batch
            is embedded in the prompt as indented JSON.

    Returns:
        Agent: configured with the module-level ``llm`` and a headless
        1280x720 browser configuration.
    """
    paper_count = len(paper_batch)
    papers_json = json.dumps(paper_batch, indent=2)

    # NOTE(review): passed through as-is; whether Agent honours a
    # `browser_config` keyword depends on the installed browser_use
    # version - confirm.
    browser_settings = dict(
        headless=True,
        viewport_size=dict(width=1280, height=720),
    )

    # Literal braces of the JSON example are doubled ({{ }}) because this is
    # an f-string.
    task_prompt = f"""
Analyze these {paper_count} academic papers on momentum trading/investing:

{papers_json}

Please provide:

1. **Markdown Summary** for each paper with:
   - Title and Authors
   - Key Findings (3-4 bullet points)
   - Implications for Investors 
   - Practical Recommendations

2. **Structured JSON** output with this format:
```json
[
  {{
    "title": "paper title",
    "authors": ["author1", "author2"],
    "year": "2024",
    "url": "arxiv_url",
    "key_findings": ["finding1", "finding2", "finding3"],
    "implications": ["implication1", "implication2"],
    "recommendations": ["rec1", "rec2"],
    "relevance_score": 8.5,
    "summary": "one paragraph summary"
  }}
]
```

Focus on practical trading insights and quantitative results where available.
"""

    return Agent(task=task_prompt, llm=llm, browser_config=browser_settings)

# ===============================
# Progress Tracking
# ===============================

def log_progress(message, level="INFO"):
    """Print ``message`` to stdout as ``[YYYY-MM-DD HH:MM:SS] LEVEL: message``.

    The timestamp is the current local time; ``level`` is a free-form label.
    """
    stamp = format(datetime.now(), "%Y-%m-%d %H:%M:%S")
    line = "[{}] {}: {}".format(stamp, level, message)
    print(line)

# ===============================
# Main Run (IMPROVED)
# ===============================

async def _collect_new_papers(visited, max_papers_per_query):
    """Fetch every configured query and return the not-yet-visited papers, de-duplicated by id."""
    collected = []
    # Seeded with the visited ids and extended as papers are collected, so a
    # paper matched by several queries is only queued once.
    seen_ids = set(visited)
    total_queries = len(queries)

    for i, query in enumerate(queries, 1):
        log_progress(f"Processing query {i}/{total_queries}: '{query}'")

        try:
            # fetch_arxiv is synchronous; it runs inline on the event loop.
            papers = fetch_arxiv(query, max_results=max_papers_per_query)

            new_papers = []
            for paper in papers:
                if paper["id"] not in seen_ids:
                    seen_ids.add(paper["id"])
                    new_papers.append(paper)

            log_progress(f"Found {len(papers)} total papers, {len(new_papers)} new ones")
            collected.extend(new_papers)

            # Rate limiting - be nice to arXiv. asyncio.sleep (not
            # time.sleep) so the event loop is not blocked while waiting.
            if i < total_queries:
                await asyncio.sleep(1)

        except Exception as e:
            log_progress(f"Error processing query '{query}': {str(e)}", "ERROR")
            continue

    return collected


def _fallback_records(batch):
    """Build minimal structured records from raw paper dicts when the agent gave no structured output."""
    processed_date = datetime.now().isoformat()
    return [{
        "title": p["title"],
        "authors": p["authors"],
        "year": p["year"],
        "url": p["url"],
        "processed_date": processed_date,
        "query_term": p["query_term"]
    } for p in batch]


async def run_research(batch_size=3, max_papers_per_query=5):
    """Fetch new arXiv papers for every query and have an agent summarise them in batches.

    Args:
        batch_size: number of papers handed to one agent run (must be >= 1).
        max_papers_per_query: maximum results requested from arXiv per query.

    For every batch the agent's final result is appended to the markdown
    summary, and structured records (or basic paper metadata as a fallback)
    are appended to the structured JSON file. A batch is recorded as visited
    only after its summary was saved; batches that time out, raise, or
    produce no result stay unvisited so a later run retries them.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    log_progress("Starting arXiv research...")
    visited = load_visited_papers()

    # Fetch papers from all queries
    all_new_papers = await _collect_new_papers(visited, max_papers_per_query)

    if not all_new_papers:
        log_progress("No new papers found. All papers already processed.")
        return

    processed_count = 0
    failed_count = 0

    # Process papers in batches
    for batch_start in range(0, len(all_new_papers), batch_size):
        batch_end = min(batch_start + batch_size, len(all_new_papers))
        current_batch = all_new_papers[batch_start:batch_end]
        batch_number = batch_start // batch_size + 1

        log_progress(f"Processing batch {batch_number}: papers {batch_start+1}-{batch_end}")

        agent = make_agent(current_batch)
        succeeded = False

        try:
            history = await asyncio.wait_for(agent.run(max_steps=25), timeout=600)

            final_result = history.final_result()
            if final_result:
                # Save markdown summary
                batch_header = f"## Research Batch {batch_number} ({datetime.now().strftime('%Y-%m-%d %H:%M')})"
                append_markdown(f"{batch_header}\n\n{final_result}")
                succeeded = True

                # Try to extract and save structured data. Failure here is
                # only a warning: the markdown summary is already on disk.
                try:
                    if hasattr(history, 'structured_output') and history.structured_output:
                        append_structured(history.structured_output)
                    else:
                        # Fallback: save basic paper info
                        append_structured(_fallback_records(current_batch))

                except Exception as e:
                    log_progress(f"Error saving structured data: {str(e)}", "WARNING")

                log_progress(f"Successfully processed {len(current_batch)} papers")
            else:
                log_progress("Agent produced no final result", "WARNING")

        except asyncio.TimeoutError:
            log_progress("Agent timed out - continuing with next batch", "ERROR")
        except Exception as e:
            log_progress(f"Error running agent: {str(e)}", "ERROR")

        # Update visited papers - only for batches whose summary was saved,
        # so failed batches are picked up again on the next run.
        if succeeded:
            visited.update(p["id"] for p in current_batch)
            save_visited_papers(visited)
            processed_count += len(current_batch)
        else:
            failed_count += len(current_batch)

        # Pause between batches
        if batch_end < len(all_new_papers):
            log_progress("Pausing before next batch...")
            await asyncio.sleep(3)

    if failed_count:
        log_progress(
            f"{failed_count} papers were not processed and will be retried on the next run",
            "WARNING",
        )
    log_progress(f"Research complete! Processed {processed_count} new papers total.")

# ===============================
# Test Function
# ===============================

def test_arxiv_fetch():
    """Smoke-test ``fetch_arxiv`` with a fixed query and print the first hit.

    Returns:
        bool: True when the fetch and the printing finished without raising
        (even if zero papers came back), False when any exception occurred.
    """
    print("Testing arXiv fetch...")

    try:
        results = fetch_arxiv("momentum trading", max_results=3)
        print(f"✅ Successfully fetched {len(results)} papers")

        if results:
            print("\nFirst paper:")
            first = results[0]
            author_line = ', '.join(first['authors'])
            print(f"Title: {first['title']}")
            print(f"Authors: {author_line}")
            print(f"Year: {first['year']}")
            print(f"ID: {first['id']}")
    except Exception as exc:
        # Any failure (fetch or malformed record) counts as a failed test.
        print(f"❌ Test failed: {str(exc)}")
        return False

    return True

# ===============================
# Manual Execution
# ===============================

# Connectivity smoke test. Runs only when this code executes as the main module
# (for example as a notebook cell), not when it is imported.
if __name__ == "__main__":
    print("🔧 Enhanced arXiv research agent ready.")
    print("📊 Testing arXiv connection...")
    
    if test_arxiv_fetch():
        print("\n✅ Connection test passed!")
        print("📚 Run: await run_research() to start full research")
    else:
        print("\n❌ Connection test failed. Check your internet connection.")

# Start the full research run immediately, regardless of the smoke-test result above.
# NOTE: a top-level `await` only works in environments that support it (e.g.
# Jupyter/IPython); in a plain Python script it is a SyntaxError. Comment this
# line out to run only the connection test.
await run_research()

🔧 Enhanced arXiv research agent ready.
📊 Testing arXiv connection...
Testing arXiv fetch...
Fetching arXiv papers for query: 'momentum trading' (attempt 1)
Successfully fetched 3 papers for 'momentum trading'
✅ Successfully fetched 3 papers

First paper:
Title: Momentum universe shrinkage effect in price momentum
Authors: Jaehyung Choi, Sungsoo Choi, Wonseok Kang
Year: 2012
ID: 1211.6517v1

✅ Connection test passed!
📚 Run: await run_research() to start full research
[2025-09-06 17:46:31] INFO: Starting arXiv research...
[2025-09-06 17:46:31] INFO: Processing query 1/5: 'momentum trading'
Fetching arXiv papers for query: 'momentum trading' (attempt 1)
Successfully fetched 1 papers for 'momentum trading'
[2025-09-06 17:46:32] INFO: Found 1 total papers, 1 new ones
[2025-09-06 17:46:33] INFO: Processing query 2/5: 'momentum investing'
Fetching arXiv papers for query: 'momentum investing' (attempt 1)
Successfully fetched 5 papers for 'momentum investing'
[2025-09-06 17:46:33] INFO: Found 5

  gc.collect()


[2025-09-06 17:48:17] INFO: Successfully processed 3 papers
[2025-09-06 17:48:17] INFO: Pausing before next batch...
[2025-09-06 17:48:20] INFO: Processing batch 2: papers 4-6
INFO     [Agent] [34m🚀 Task: 
Analyze these 3 academic papers on momentum trading/investing:

[
  {
    "id": "2308.11294v1",
    "title": "Network Momentum across Asset Classes",
    "authors": [
      "Xingyue Pu",
      "Stephen Roberts",
      "Xiaowen Dong",
      "Stefan Zohren"
    ],
    "year": "2023",
    "url": "http://arxiv.org/abs/2308.11294v1",
    "abstract": "We investigate the concept of network momentum, a novel trading signal derived from momentum spillover across assets. Initially observed within the confines of pairwise economic and fundamental ties, such as the stock-bond connection of the same company and stocks linked through supply-demand chains, momentum spillover implies a propagation of momentum risk premium from one asset to another. The similarity of momentum risk premium, exemplifi