In [None]:
from browser_use import Agent, ChatGoogle
from dotenv import load_dotenv
import os
import asyncio
from datetime import datetime
import json
import time
from pathlib import Path
import logging
from typing import List, Dict, Optional
import hashlib

# ===============================
# Quick Setup with Immediate Feedback
# ===============================

# Startup banner so the user sees progress before the (potentially slow) setup below.
print("🔧 Initializing DrinksFoodLife Article Scraper...")

# Root logging config: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)

# Must run before the LLM client is built.
# NOTE(review): presumably this loads the Google API key from a local .env file
# into the environment for ChatGoogle — confirm the expected variable name.
load_dotenv()
# Single shared LLM client handed to the browser-use Agent.
llm = ChatGoogle(model="gemini-2.5-flash")

print("✅ LLM initialized")

# ===============================
# Configuration
# ===============================

# Directory (relative to the current working directory) that receives the
# generated Markdown and JSON files.
OUTPUT_DIR = Path("drinksfoodlife_output")
# Import-time side effect: create the directory now; exist_ok=True makes
# re-runs a no-op instead of raising FileExistsError.
OUTPUT_DIR.mkdir(exist_ok=True)

# Site homepage; interpolated into the agent prompt and recorded as the
# "source" field of the JSON output.
WEBSITE_URL = "https://www.drinksfoodlife.com"

# ===============================
# Helper Functions
# ===============================

def create_markdown_file(articles: List[Dict], filename: str) -> None:
    """Write the scraped articles to a Markdown report inside OUTPUT_DIR.

    The report starts with a title, the scrape timestamp and the article
    count, followed by one numbered section per article holding its title,
    a link to its URL and its summary. Sections are separated by
    horizontal rules.

    Args:
        articles: Article dictionaries; the keys 'title', 'url' and
            'summary' are read, each with a placeholder fallback.
        filename: Name of the file to create (or overwrite) in OUTPUT_DIR.
    """
    target = OUTPUT_DIR / filename

    with open(target, 'w', encoding='utf-8') as out:
        # Report header.
        out.write("# DrinksFoodLife.com Articles\n\n")
        scraped_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        out.write(f"*Scraped on: {scraped_at}*\n\n")
        out.write(f"*Total Articles: {len(articles)}*\n\n")
        out.write("---\n\n")

        # One section per article, numbered from 1.
        for position, entry in enumerate(articles, start=1):
            heading = entry.get('title', 'Untitled')
            # A missing URL shows "N/A" as the link text and "#" as its target.
            link_text = entry.get('url', 'N/A')
            link_target = entry.get('url', '#')
            body = entry.get('summary', 'No summary available.')

            out.write(f"## {position}. {heading}\n\n")
            out.write(f"**URL:** [{link_text}]({link_target})\n\n")
            out.write(f"**Summary:**\n\n{body}\n\n")
            out.write("---\n\n")

    logger.info(f"📄 Markdown file created: {target}")
    print(f"✅ Markdown saved to: {target}")

def create_json_file(articles: List[Dict], filename: str) -> None:
    """Write the scraped articles, plus run metadata, to a JSON file in OUTPUT_DIR.

    The document is an object with the ISO-8601 scrape time, the source
    site, the article count and the article list itself. Non-ASCII
    characters are written as-is (UTF-8), indented by two spaces.

    Args:
        articles: Article dictionaries to serialize unchanged.
        filename: Name of the file to create (or overwrite) in OUTPUT_DIR.
    """
    target = OUTPUT_DIR / filename

    # Envelope around the raw article list; key order is the output order.
    envelope = dict(
        scraped_at=datetime.now().isoformat(),
        source=WEBSITE_URL,
        total_articles=len(articles),
        articles=articles,
    )

    with open(target, 'w', encoding='utf-8') as out:
        json.dump(envelope, out, indent=2, ensure_ascii=False)

    logger.info(f"📋 JSON file created: {target}")
    print(f"✅ JSON saved to: {target}")

def generate_unique_filename(base_name: str, extension: str) -> str:
    """Build ``<base_name>_<YYYYMMDD_HHMMSS>.<extension>`` from the current local time.

    Args:
        base_name: Leading part of the filename.
        extension: File extension without the leading dot.

    Returns:
        The assembled filename. Uniqueness is only as fine as one second.
    """
    stamp = format(datetime.now(), '%Y%m%d_%H%M%S')
    return "{}_{}.{}".format(base_name, stamp, extension)

# ===============================
# Main Scraping Function
# ===============================

async def scrape_drinksfoodlife_articles() -> object:
    """
    Run a browser-use agent that scrapes articles from DrinksFoodLife.com.

    The agent is prompted to visit WEBSITE_URL, open each homepage article,
    and report title, URL and a short summary as a JSON array.

    Returns:
        The raw, unparsed value produced by ``Agent.run()``. This is NOT yet
        a list of article dictionaries; pass it to ``parse_agent_result`` to
        obtain one.
        NOTE(review): in browser-use this is expected to be a history object
        exposing ``final_result()`` — confirm against the installed version.

    Raises:
        Exception: Any error raised by the agent run is logged with its
            traceback and re-raised unchanged.
    """
    
    logger.info("🚀 Starting article scraping from DrinksFoodLife.com...")
    print("🌐 Opening browser and navigating to website...")
    
    # The prompt text is runtime behavior: the doubled braces are f-string
    # escapes that render as literal JSON braces in the instructions.
    task = f"""
    Navigate to {WEBSITE_URL} and scrape articles from the homepage.
    
    For each article you find:
    1. Extract the exact article title
    2. Extract the complete article URL
    3. Click on the article to read the full content
    4. Create a concise summary of approximately 100 tokens that captures the main points
    5. Return to the homepage to continue finding more articles
    
    Try to collect at least 10-15 articles.
    
    Return the data as a JSON array with this exact structure:
    [
        {{
            "title": "Article Title",
            "url": "https://www.drinksfoodlife.com/article-url",
            "summary": "A 100-token summary of the article content..."
        }},
        ...
    ]
    
    Important:
    - Make sure URLs are complete and clickable
    - Summaries should be informative and capture key points
    - Focus on recent/main articles visible on the homepage
    """
    
    agent = Agent(
        task=task,
        llm=llm,
    )
    
    try:
        result = await agent.run()
    except Exception as e:
        # logger.exception keeps the traceback, which the plain error
        # message alone would lose; the caller still sees the exception.
        logger.exception(f"❌ Error during scraping: {str(e)}")
        raise
    
    logger.info("✅ Scraping completed successfully")
    return result

def _load_json_payload(text: str):
    """Decode JSON from *text*, tolerating text wrapped around a JSON array.

    LLM output is often wrapped in markdown code fences or surrounded by
    prose. If the text as a whole is not valid JSON, the span from the
    first '[' to the last ']' is decoded instead.

    Raises:
        json.JSONDecodeError: If neither attempt yields valid JSON.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        start = text.find('[')
        end = text.rfind(']')
        if start == -1 or end < start:
            raise
        return json.loads(text[start:end + 1])


def parse_agent_result(result) -> List[Dict]:
    """
    Parse the agent result to extract article data.

    Accepted inputs:
      * a list of article dictionaries;
      * a JSON string (optionally wrapped in markdown fences or prose);
      * an object exposing ``final_result()`` that returns either of the above;
      * a dictionary, or a JSON object, that is either a single article or
        holds the article list under the key 'articles'.

    Anything else (``None``, numbers, unparseable text) yields an empty list
    rather than raising.

    Args:
        result: Raw value returned by the scraping step.

    Returns:
        A list of dictionaries with exactly the keys 'title', 'url' and
        'summary'. Entries that are not dictionaries, or that lack 'title'
        or 'url', are dropped; a missing summary gets a placeholder.
    """
    # Same logger object as the module-level one (looked up by module name).
    log = logging.getLogger(__name__)
    log.info("🔍 Parsing agent results...")

    # Unwrap agent history objects; plain lists are used as-is.
    from_agent = not isinstance(result, list) and hasattr(result, 'final_result')
    payload = result.final_result() if from_agent else result

    if isinstance(payload, str):
        try:
            payload = _load_json_payload(payload)
        except json.JSONDecodeError:
            if from_agent:
                log.warning("⚠️ Could not parse result as JSON")
            else:
                log.warning("⚠️ Result is not valid JSON")
            payload = []

    if isinstance(payload, dict):
        # Either an envelope such as {"articles": [...]} or one bare article.
        nested = payload.get('articles')
        payload = nested if isinstance(nested, list) else [payload]
    elif not isinstance(payload, list):
        # None, numbers, booleans...: nothing iterable to validate.
        payload = []

    # Validate article structure and normalise to the three known keys.
    validated_articles = [
        {
            'title': article.get('title', 'Untitled'),
            'url': article.get('url', ''),
            'summary': article.get('summary', 'No summary available.'),
        }
        for article in payload
        if isinstance(article, dict) and 'title' in article and 'url' in article
    ]

    log.info(f"✅ Successfully parsed {len(validated_articles)} articles")
    return validated_articles

# ===============================
# Main Execution
# ===============================

async def main():
    """Run the full pipeline: scrape, parse, write both output files, report.

    Stops early, after logging the raw agent result for debugging, when no
    articles could be extracted. Any exception is logged, echoed to the
    console and re-raised.
    """
    rule = "=" * 50

    print("\n" + rule)
    print("🍽️  DRINKSFOODLIFE.COM ARTICLE SCRAPER")
    print(rule + "\n")

    started = time.time()

    try:
        # Scrape, then turn the raw agent output into article dictionaries.
        raw = await scrape_drinksfoodlife_articles()
        articles = parse_agent_result(raw)

        if len(articles) == 0:
            logger.error("❌ No articles were extracted. Check the agent output.")
            logger.info(f"Raw result type: {type(raw)}")
            logger.info(f"Raw result: {raw}")
            print("\n⚠️  No articles found. The website structure may have changed.")
            return

        print(f"\n✨ Successfully extracted {len(articles)} articles!\n")

        # One shared timestamp so the .md and .json files of a run pair up.
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        stem = f"drinksfoodlife_articles_{stamp}"

        create_markdown_file(articles, stem + ".md")
        create_json_file(articles, stem + ".json")

        # Closing summary.
        duration = time.time() - started
        print("\n" + rule)
        print("📊 SCRAPING SUMMARY")
        print(rule)
        print(f"✅ Articles scraped: {len(articles)}")
        print(f"⏱️  Time taken: {duration:.2f} seconds")
        print(f"📁 Output directory: {OUTPUT_DIR.absolute()}")
        print(rule + "\n")

        logger.info("✨ All done! Check the generated files.")

    except Exception as exc:
        logger.error(f"❌ Fatal error: {exc!s}")
        print(f"\n❌ An error occurred: {exc!s}")
        raise

if __name__ == "__main__":
    # asyncio.run() raises RuntimeError when an event loop is already
    # running, which is the case inside a Jupyter/IPython cell. Detect that
    # and schedule main() on the existing loop instead.
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        # Plain script execution: no loop yet, so create one and block on it.
        asyncio.run(main())
    else:
        # Keep a reference so the task is not garbage-collected mid-run.
        main_task = running_loop.create_task(main())

ModuleNotFoundError: No module named 'langchain_google_genai'