In [None]:
# Works OK: first version - scrapes homepage articles and writes timestamped Markdown and JSON files.
from browser_use import Agent, ChatGoogle
from dotenv import load_dotenv
import os
import asyncio
from datetime import datetime
import json
import time
from pathlib import Path
import logging
from typing import List, Dict, Optional
import hashlib

# ===============================
# Quick Setup with Immediate Feedback
# ===============================

# Progress message printed as soon as the cell starts executing.
print("🔧 Initializing DrinksFoodLife Article Scraper...")

# Request INFO-level logging with a timestamped record format.
# NOTE(review): log records captured later in this notebook appear as
# "INFO     [__main__] ..." rather than in this format, so this call seems
# to have no visible effect - presumably logging was already configured
# before this line runs; confirm.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# presumably loads environment variables (e.g. the LLM API key) from a .env
# file before the client below is created - confirm
load_dotenv()
# Module-level LLM client; the scraping function passes it to Agent(llm=...).
llm = ChatGoogle(model="gemini-2.5-flash")

print("✅ LLM initialized")

# ===============================
# Configuration
# ===============================

# Directory (relative to the current working directory) that receives all
# generated files; created here if it does not exist yet.
OUTPUT_DIR = Path("drinksfoodlife_output")
OUTPUT_DIR.mkdir(exist_ok=True)

# Site root: interpolated into the agent task and recorded as "source" in
# the JSON output.
WEBSITE_URL = "https://www.drinksfoodlife.com"

# ===============================
# Helper Functions
# ===============================

def create_markdown_file(articles: List[Dict], filename: str) -> None:
    """Write the scraped articles to a Markdown report inside OUTPUT_DIR.

    The report starts with a title, the scrape timestamp and the article
    count, followed by one numbered section per article containing its
    title, a link to its URL and its summary. Missing fields fall back to
    'Untitled', 'N/A' / '#', and 'No summary available.'.

    Args:
        articles: Article dictionaries (keys: title, url, summary).
        filename: Name of the file to create inside OUTPUT_DIR.
    """
    target = OUTPUT_DIR / filename

    with target.open('w', encoding='utf-8') as out:
        emit = out.write

        # Report header.
        scraped_on = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        emit("# DrinksFoodLife.com Articles\n\n")
        emit(f"*Scraped on: {scraped_on}*\n\n")
        emit(f"*Total Articles: {len(articles)}*\n\n")
        emit("---\n\n")

        # One section per article, numbered from 1.
        for position, entry in enumerate(articles, start=1):
            heading = entry.get('title', 'Untitled')
            link_text = entry.get('url', 'N/A')
            link_target = entry.get('url', '#')
            body = entry.get('summary', 'No summary available.')
            emit(
                f"## {position}. {heading}\n\n"
                f"**URL:** [{link_text}]({link_target})\n\n"
                f"**Summary:**\n\n{body}\n\n"
                "---\n\n"
            )

    logger.info(f"📄 Markdown file created: {target}")
    print(f"✅ Markdown saved to: {target}")

def create_json_file(articles: List[Dict], filename: str) -> None:
    """Write the scraped articles, wrapped in a metadata envelope, as JSON.

    The envelope records the scrape time, the source website, the number of
    articles and the article list itself. The file is written to OUTPUT_DIR
    as UTF-8 with two-space indentation and non-ASCII characters preserved.

    Args:
        articles: Article dictionaries to serialise.
        filename: Name of the file to create inside OUTPUT_DIR.
    """
    target = OUTPUT_DIR / filename

    # Build the envelope before opening the file (same order as the keys
    # appear in the output).
    envelope = dict(
        scraped_at=datetime.now().isoformat(),
        source=WEBSITE_URL,
        total_articles=len(articles),
        articles=articles,
    )

    with target.open('w', encoding='utf-8') as handle:
        json.dump(envelope, handle, indent=2, ensure_ascii=False)

    logger.info(f"📋 JSON file created: {target}")
    print(f"✅ JSON saved to: {target}")

def generate_unique_filename(base_name: str, extension: str) -> str:
    """Return '<base_name>_<YYYYmmdd_HHMMSS>.<extension>' for the current time.

    Names are only as unique as the one-second timestamp resolution allows.
    """
    # Formatting a datetime with a strftime-style format spec is equivalent
    # to calling .strftime() on it.
    stamp = format(datetime.now(), '%Y%m%d_%H%M%S')
    return f"{base_name}_{stamp}.{extension}"

# ===============================
# Main Scraping Function
# ===============================

async def scrape_drinksfoodlife_articles() -> List[Dict]:
    """
    Run a browser-use Agent that scrapes articles from the WEBSITE_URL homepage.

    The agent is given a natural-language task asking it to collect the
    title, URL and a ~100-token summary for roughly 10-15 homepage articles
    and to answer with a JSON array of {title, url, summary} objects.

    Returns:
        Whatever `agent.run()` returns, unchanged.
        NOTE(review): the annotation says List[Dict], but no conversion is
        done here; parse_agent_result() in this file probes the value for a
        `final_result` method, so it is presumably an agent result/history
        object rather than a list - confirm against browser-use.

    Raises:
        Exception: any error raised by `agent.run()` is logged and re-raised.
    """
    
    logger.info("🚀 Starting article scraping from DrinksFoodLife.com...")
    print("🌐 Opening browser and navigating to website...")
    
    # Prompt for the agent. Doubled braces ({{ }}) are literal braces in the
    # f-string; only {WEBSITE_URL} is interpolated.
    task = f"""
    Navigate to {WEBSITE_URL} and scrape articles from the homepage.
    
    For each article you find:
    1. Extract the exact article title
    2. Extract the complete article URL
    3. Click on the article to read the full content
    4. Create a concise summary of approximately 100 tokens that captures the main points
    5. Return to the homepage to continue finding more articles
    
    Try to collect at least 10-15 articles.
    
    Return the data as a JSON array with this exact structure:
    [
        {{
            "title": "Article Title",
            "url": "https://www.drinksfoodlife.com/article-url",
            "summary": "A 100-token summary of the article content..."
        }},
        ...
    ]
    
    Important:
    - Make sure URLs are complete and clickable
    - Summaries should be informative and capture key points
    - Focus on recent/main articles visible on the homepage
    """
    
    # Uses the module-level `llm` client.
    agent = Agent(
        task=task,
        llm=llm,
    )
    
    try:
        result = await agent.run()
        logger.info("✅ Scraping completed successfully")
        return result
    except Exception as e:
        # Log for visibility, then let the caller decide how to handle it.
        logger.error(f"❌ Error during scraping: {str(e)}")
        raise

def parse_agent_result(result) -> List[Dict]:
    """
    Extract a validated list of article dictionaries from the agent output.

    Accepted inputs:
      * a list of article dicts;
      * a JSON string (optionally wrapped in a Markdown ``` code fence);
      * a dict wrapping the list under 'articles' or 'data';
      * any object exposing `final_result()`, whose return value is then
        handled like one of the cases above.

    Anything that does not resolve to a list yields an empty result instead
    of being iterated blindly (a decoded JSON object would otherwise be
    walked key by key, and a decoded scalar would raise TypeError).

    Args:
        result: Raw value returned by the scraping agent.

    Returns:
        A list of dicts with exactly the keys 'title', 'url' and 'summary'.
        Entries that are not dicts or lack 'title' or 'url' are dropped; a
        missing summary becomes 'No summary available.'.
    """
    logger.info("🔍 Parsing agent results...")

    payload = result
    parse_warning = "⚠️ Result is not valid JSON"
    if not isinstance(result, (list, str, dict)) and hasattr(result, 'final_result'):
        # Result object from the agent: use its final answer.
        payload = result.final_result()
        parse_warning = "⚠️ Could not parse result as JSON"

    if isinstance(payload, str):
        text = payload.strip()
        if text.startswith("```"):
            # Drop the opening fence line (``` or ```json) and the closing fence.
            _, _, text = text.partition("\n")
            text = text.rsplit("```", 1)[0]
        try:
            payload = json.loads(text)
        except json.JSONDecodeError:
            logger.warning(parse_warning)
            payload = []

    if isinstance(payload, dict):
        # The list may be wrapped in an envelope object.
        if 'articles' in payload:
            payload = payload['articles']
        elif 'data' in payload:
            payload = payload['data']

    articles = payload if isinstance(payload, list) else []

    # Validate article structure
    validated_articles = []
    for article in articles:
        if isinstance(article, dict) and 'title' in article and 'url' in article:
            validated_articles.append({
                'title': article.get('title', 'Untitled'),
                'url': article.get('url', ''),
                'summary': article.get('summary', 'No summary available.')
            })

    logger.info(f"✅ Successfully parsed {len(validated_articles)} articles")
    return validated_articles

# ===============================
# Main Execution
# ===============================

async def main():
    """Scrape the homepage once and write Markdown and JSON reports.

    Prints a banner, runs the scraping agent, parses its output and, when at
    least one article was extracted, writes two timestamped files to
    OUTPUT_DIR and prints a short summary. Any exception is logged, echoed
    to stdout and re-raised.
    """
    rule = "=" * 50

    print(f"\n{rule}")
    print("🍽️  DRINKSFOODLIFE.COM ARTICLE SCRAPER")
    print(f"{rule}\n")

    started_at = time.time()

    try:
        # Run the agent, then normalise whatever it returned.
        raw_result = await scrape_drinksfoodlife_articles()
        parsed = parse_agent_result(raw_result)

        # Nothing usable: dump the raw output for diagnosis and stop.
        if not parsed:
            logger.error("❌ No articles were extracted. Check the agent output.")
            logger.info(f"Raw result type: {type(raw_result)}")
            logger.info(f"Raw result: {raw_result}")
            print("\n⚠️  No articles found. The website structure may have changed.")
            return

        print(f"\n✨ Successfully extracted {len(parsed)} articles!\n")

        # Both output files share one timestamp so they can be paired.
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        stem = f"drinksfoodlife_articles_{stamp}"
        create_markdown_file(parsed, f"{stem}.md")
        create_json_file(parsed, f"{stem}.json")

        # Final report.
        duration = time.time() - started_at
        print(f"\n{rule}")
        print("📊 SCRAPING SUMMARY")
        print(rule)
        print(f"✅ Articles scraped: {len(parsed)}")
        print(f"⏱️  Time taken: {duration:.2f} seconds")
        print(f"📁 Output directory: {OUTPUT_DIR.absolute()}")
        print(f"{rule}\n")

        logger.info("✨ All done! Check the generated files.")

    except Exception as exc:
        logger.error(f"❌ Fatal error: {exc!s}")
        print(f"\n❌ An error occurred: {exc!s}")
        raise

if __name__ == "__main__":
    # `get_ipython` is only defined inside IPython/Jupyter sessions; calling
    # it anywhere else raises NameError, which is used as the detector.
    try:
        get_ipython()
    except NameError:
        # Plain Python process: start the event loop ourselves.
        asyncio.run(main())
    else:
        # Interactive session: asyncio.run() is not used here; the user is
        # asked to await the coroutine from a cell instead.
        print("\n⚠️  Running in Jupyter notebook detected!")
        print("📝 Please run this command in a new cell:")
        print("   await main()")
        

🔧 Initializing DrinksFoodLife Article Scraper...
✅ LLM initialized

⚠️  Running in Jupyter notebook detected!
📝 Please run this command in a new cell:
   await main()


In [9]:
# Based on the previous cell; additionally checks whether an article has already been saved before adding it.
from browser_use import Agent, ChatGoogle
from dotenv import load_dotenv
import os
import asyncio
from datetime import datetime
import json
import time
from pathlib import Path
import logging
from typing import List, Dict, Optional, Set
import hashlib

# ===============================
# Quick Setup with Immediate Feedback
# ===============================

# Progress message printed as soon as the cell starts executing.
print("🔧 Initializing DrinksFoodLife Article Scraper...")

# Request INFO-level logging with a timestamped record format.
# NOTE(review): the log records captured in this notebook's output appear as
# "INFO     [__main__] ..." rather than in this format, so this call seems
# to have no visible effect - presumably logging was already configured
# before this line runs; confirm.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# presumably loads environment variables (e.g. the LLM API key) from a .env
# file before the client below is created - confirm
load_dotenv()
# Module-level LLM client; the scraping function passes it to Agent(llm=...).
llm = ChatGoogle(model="gemini-2.5-flash")

print("✅ LLM initialized")

# ===============================
# Configuration
# ===============================

# Directory (relative to the current working directory) that receives all
# generated files; created here if it does not exist yet.
OUTPUT_DIR = Path("drinksfoodlife_output")
OUTPUT_DIR.mkdir(exist_ok=True)

# Site root: interpolated into the agent task and stored as "source" metadata.
WEBSITE_URL = "https://www.drinksfoodlife.com"
# Single cumulative store that is loaded, extended and rewritten on each run.
ARTICLES_JSON_PATH = OUTPUT_DIR / "articles.json"

# ===============================
# JSON Management Functions
# ===============================

def load_existing_articles() -> Dict:
    """Load the article store from articles.json, or return an empty store.

    Two on-disk layouts are accepted:
      * legacy: a bare JSON list of article dicts, which is wrapped in the
        current envelope;
      * current: a JSON object with 'created_at', 'last_updated', 'source',
        'total_articles' and 'articles'.

    The loaded store is normalised so callers can rely on every envelope key
    being present: missing metadata is filled in, 'total_articles' is
    recomputed from the list, entries that are not dicts are ignored (with a
    warning), and articles lacking 'article_id' or 'added_at' get them
    backfilled.

    If the file cannot be read or decoded it is renamed to a timestamped
    backup next to it and an empty store is returned. A failure of that
    rename propagates, so an unreadable file is never silently overwritten
    by a later save.

    Returns:
        The article store dictionary.
    """
    if ARTICLES_JSON_PATH.exists():
        try:
            with open(ARTICLES_JSON_PATH, 'r', encoding='utf-8') as f:
                data = json.load(f)

            now = datetime.now().isoformat()

            # Handle legacy format (list) vs new format (dict)
            if isinstance(data, list):
                logger.info(f"🔄 Converting legacy format: {len(data)} articles")
                data = {
                    "created_at": now,
                    "last_updated": now,
                    "source": WEBSITE_URL,
                    "total_articles": len(data),
                    "articles": data
                }
            elif not isinstance(data, dict):
                # Handled by the generic branch below: back up, start fresh.
                raise ValueError(
                    f"expected a JSON object or array, got {type(data).__name__}"
                )

            raw_articles = data.get('articles')
            if not isinstance(raw_articles, list):
                raw_articles = []
            articles = [entry for entry in raw_articles if isinstance(entry, dict)]
            ignored = len(raw_articles) - len(articles)
            if ignored:
                logger.warning(f"⚠️ Ignored {ignored} malformed article entries")

            # Backfill tracking fields on entries that predate them.
            for article in articles:
                if 'article_id' not in article:
                    article['article_id'] = get_article_hash(str(article.get('url') or ''))
                if 'added_at' not in article:
                    article['added_at'] = now

            # Guarantee the full envelope for callers that index these keys.
            data.setdefault('created_at', now)
            data.setdefault('last_updated', now)
            data.setdefault('source', WEBSITE_URL)
            data['total_articles'] = len(articles)
            data['articles'] = articles

            logger.info(f"✅ Loaded {len(articles)} existing articles")
            return data
        except json.JSONDecodeError as e:
            logger.error(f"❌ Error reading articles.json: {e}")
            logger.info("🔄 Creating backup and starting fresh")
            # Backup corrupted file
            backup_path = OUTPUT_DIR / f"articles_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
            ARTICLES_JSON_PATH.rename(backup_path)
            logger.info(f"📦 Backed up to: {backup_path}")
        except Exception as e:
            logger.error(f"❌ Unexpected error loading articles.json: {e}")
            # Backup file for safety
            backup_path = OUTPUT_DIR / f"articles_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
            ARTICLES_JSON_PATH.rename(backup_path)
            logger.info(f"📦 Backed up to: {backup_path}")

    # Return new structure
    return {
        "created_at": datetime.now().isoformat(),
        "last_updated": datetime.now().isoformat(),
        "source": WEBSITE_URL,
        "total_articles": 0,
        "articles": []
    }

def get_article_hash(url: str) -> str:
    """Return the hexadecimal MD5 digest of *url*, used as the article id.

    The URL is encoded with str.encode()'s default encoding (UTF-8).
    """
    hasher = hashlib.md5()
    hasher.update(url.encode())
    return hasher.hexdigest()

def get_existing_urls(articles_data: Dict) -> Set[str]:
    """Return the set of URLs of the articles already in the store.

    Entries that are not dicts, or whose 'url' is missing, empty or not a
    string, are skipped instead of raising KeyError: stored articles are not
    guaranteed to carry a URL.

    Args:
        articles_data: Article store; its 'articles' key is optional.

    Returns:
        The non-empty string URLs found in the store.
    """
    return {
        article['url']
        for article in articles_data.get('articles', [])
        if isinstance(article, dict)
        and isinstance(article.get('url'), str)
        and article['url']
    }

def add_new_articles(existing_data: Dict, new_articles: List[Dict]) -> tuple[Dict, int]:
    """
    Append articles whose URL is not yet in the store.

    Each appended record is a copy of the incoming article, extended with an
    'article_id' (hash of its URL) and an 'added_at' ISO timestamp, so the
    caller's dictionaries are left untouched. Articles that are not dicts,
    have no URL, or repeat a URL already stored (or seen earlier in the same
    batch) are skipped.

    Args:
        existing_data: Article store; updated in place. A missing 'articles'
            list is created.
        new_articles: Candidate articles to merge in.

    Returns:
        A tuple of (the same store object, number of articles appended). The
        store's 'last_updated' and 'total_articles' are always refreshed.
    """
    existing_urls = get_existing_urls(existing_data)
    stored = existing_data.setdefault('articles', [])
    added_count = 0

    for article in new_articles:
        if not isinstance(article, dict):
            continue
        url = article.get('url', '')
        if url and url not in existing_urls:
            # Copy so the tracking fields do not leak into the caller's data.
            record = dict(article)
            record['article_id'] = get_article_hash(url)
            record['added_at'] = datetime.now().isoformat()
            stored.append(record)
            existing_urls.add(url)
            added_count += 1
            logger.info(f"➕ Added new article: {record.get('title', 'Untitled')}")

    # Update metadata
    existing_data['last_updated'] = datetime.now().isoformat()
    existing_data['total_articles'] = len(stored)

    return existing_data, added_count

def save_articles_json(data: Dict) -> None:
    """Write the article store to articles.json via a validated temp file.

    The data is first written to articles_temp.json in OUTPUT_DIR, read back
    to prove it is valid JSON, and only then moved over articles.json, so a
    failed write never damages the existing store. If any step fails, the
    temporary file is removed and the error is logged and re-raised.

    Args:
        data: Article store to persist.

    Raises:
        Exception: whatever the write, validation or replace step raised.
    """
    temp_path = OUTPUT_DIR / "articles_temp.json"
    try:
        # Create a temporary file first
        with open(temp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False, sort_keys=False)

        # Validate the JSON was written correctly
        with open(temp_path, 'r', encoding='utf-8') as f:
            json.load(f)  # This will raise an error if JSON is invalid

        # If validation passed, replace the original file
        temp_path.replace(ARTICLES_JSON_PATH)
    except Exception as e:
        logger.error(f"❌ Error saving articles.json: {e}")
        # Do not leave a half-written temp file behind.
        try:
            temp_path.unlink(missing_ok=True)
        except OSError as cleanup_error:
            logger.warning(f"⚠️ Could not remove {temp_path}: {cleanup_error}")
        raise

    # Logged outside the try block so that a missing 'total_articles' key is
    # not reported as a failed save after the file has been written.
    saved_count = data.get('total_articles', len(data.get('articles', [])))
    logger.info(f"✅ Saved {saved_count} articles to articles.json")

# ===============================
# Markdown Generation
# ===============================

def create_final_markdown(articles_data: Dict) -> None:
    """Render the whole article store as a timestamped Markdown document.

    The file is named drinksfoodlife_final_<YYYYmmdd_HHMMSS>.md and written
    to OUTPUT_DIR. Articles are listed newest first by their 'added_at'
    value; each section shows the title, a link, the time it was added (when
    known) and the summary.

    A malformed or non-string 'added_at' no longer aborts the export
    mid-file: it sorts by its string form and is printed verbatim instead of
    being reformatted.

    Args:
        articles_data: Article store; 'articles' and 'last_updated' are
            optional.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    md_filename = f"drinksfoodlife_final_{timestamp}.md"
    filepath = OUTPUT_DIR / md_filename

    articles = articles_data.get('articles', [])

    # Sort before opening the file so a failure cannot leave a partial
    # document. ISO timestamps sort chronologically as strings; str() keeps
    # None or other non-string values comparable.
    sorted_articles = sorted(
        articles,
        key=lambda x: str(x.get('added_at') or ''),
        reverse=True
    )

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write("# DrinksFoodLife.com - Complete Article Collection\n\n")
        f.write(f"*Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\n")
        f.write(f"*Total Articles: {len(articles)}*\n\n")
        f.write(f"*Last Updated: {articles_data.get('last_updated', 'N/A')}*\n\n")
        f.write("---\n\n")

        for idx, article in enumerate(sorted_articles, 1):
            f.write(f"## {idx}. {article.get('title', 'Untitled')}\n\n")
            f.write(f"**URL:** [{article.get('url', 'N/A')}]({article.get('url', '#')})\n\n")

            if 'added_at' in article:
                raw_added = article['added_at']
                try:
                    added_date = datetime.fromisoformat(raw_added).strftime('%Y-%m-%d %H:%M')
                except (TypeError, ValueError):
                    # Not an ISO timestamp: show the stored value as-is.
                    added_date = str(raw_added)
                f.write(f"**Added:** {added_date}\n\n")

            f.write(f"**Summary:**\n\n{article.get('summary', 'No summary available.')}\n\n")
            f.write("---\n\n")

    logger.info(f"📄 Final markdown created: {filepath}")
    print(f"✅ Final markdown saved to: {filepath}")

# ===============================
# Main Scraping Function
# ===============================

async def scrape_entire_website() -> List[Dict]:
    """
    Run a browser-use Agent that is asked to crawl all of WEBSITE_URL.

    The task text instructs the agent to follow navigation, category, tag,
    archive and pagination links, summarise every article it finds in about
    100 tokens, and answer with a JSON array of {title, url, summary}
    objects.

    Returns:
        Whatever `agent.run()` returns, unchanged.
        NOTE(review): the annotation says List[Dict], but no conversion is
        done here; parse_agent_result() in this file probes the value for a
        `final_result` method, so it is presumably an agent result/history
        object rather than a list - confirm against browser-use.

    Raises:
        Exception: any error raised by `agent.run()` is logged and re-raised.
    """
    
    logger.info("🚀 Starting comprehensive website scraping...")
    print("🌐 Opening browser and exploring entire website...")
    
    # Prompt for the agent. Doubled braces ({{ }}) are literal braces in the
    # f-string; only {WEBSITE_URL} is interpolated.
    task = f"""
    Navigate to {WEBSITE_URL} and perform a COMPREHENSIVE scraping of the entire website.
    
    EXPLORATION STRATEGY:
    1. Start at the homepage
    2. Identify all navigation links, category pages, archive pages, and pagination
    3. Visit EVERY accessible page on the website
    4. For each article page you discover:
       - Extract the exact article title
       - Extract the complete article URL
       - Read the full article content
       - Create a concise summary of approximately 100 tokens
    
    IMPORTANT INSTRUCTIONS:
    - Explore ALL categories (recipes, drinks, lifestyle, etc.)
    - Follow ALL pagination links (page 2, 3, 4, etc.)
    - Check the footer for additional links to article archives
    - Look for "older posts", "next page", "view more" buttons
    - Visit tag pages and category pages
    - Be thorough - we want EVERY article on the site
    
    DO NOT STOP until you have:
    - Visited all navigation menu items
    - Followed all pagination to the end
    - Explored all category and tag pages
    - Collected articles from every accessible page
    
    Return data as a JSON array:
    [
        {{
            "title": "Article Title",
            "url": "https://www.drinksfoodlife.com/article-url",
            "summary": "A 100-token summary capturing key points of the article..."
        }},
        ...
    ]
    
    QUALITY REQUIREMENTS:
    - URLs must be complete and clickable
    - Summaries should be informative and capture main points
    - No duplicate articles
    - Include articles from all sections of the website
    
    Take your time and be thorough. Aim for at least 50+ articles if the site has them.
    """
    
    # Uses the module-level `llm` client.
    agent = Agent(
        task=task,
        llm=llm,
    )
    
    try:
        result = await agent.run()
        logger.info("✅ Comprehensive scraping completed")
        return result
    except Exception as e:
        # Log for visibility, then let the caller decide how to handle it.
        logger.error(f"❌ Error during scraping: {str(e)}")
        raise

def parse_agent_result(result) -> List[Dict]:
    """
    Extract a clean, de-duplicated list of articles from the agent output.

    Accepted inputs:
      * a list of article dicts;
      * a JSON string (optionally wrapped in a Markdown ``` code fence);
      * a dict wrapping the list under 'articles' or 'data';
      * any object exposing `final_result()`, whose return value is then
        handled like one of the cases above.

    The dict envelope is also unwrapped when it only appears after JSON
    decoding, and anything that does not resolve to a list yields an empty
    result instead of being iterated blindly.

    Args:
        result: Raw value returned by the scraping agent.

    Returns:
        A list of dicts with exactly the keys 'title', 'url' and 'summary'
        (all whitespace-stripped). Entries are dropped when they are not
        dicts, when 'url' or 'title' is missing, empty or not a string, or
        when the URL was already seen. A missing or non-string summary
        becomes 'No summary available.'.
    """
    logger.info("🔍 Parsing agent results...")

    payload = result
    parse_warning = "⚠️ Result is not valid JSON"
    if not isinstance(result, (list, str, dict)) and hasattr(result, 'final_result'):
        # Result object from the agent: use its final answer.
        payload = result.final_result()
        parse_warning = "⚠️ Could not parse result as JSON"

    if isinstance(payload, str):
        text = payload.strip()
        if text.startswith("```"):
            # Drop the opening fence line (``` or ```json) and the closing fence.
            _, _, text = text.partition("\n")
            text = text.rsplit("```", 1)[0]
        try:
            payload = json.loads(text)
        except json.JSONDecodeError:
            logger.warning(parse_warning)
            payload = []

    if isinstance(payload, dict):
        # Maybe it's wrapped in a dict
        if 'articles' in payload:
            payload = payload['articles']
        elif 'data' in payload:
            payload = payload['data']

    articles = payload if isinstance(payload, list) else []

    # Validate and clean article data
    validated_articles = []
    seen_urls = set()

    for article in articles:
        if not isinstance(article, dict):
            continue

        url = article.get('url')
        title = article.get('title')
        # A JSON null or number here must not crash the whole batch.
        if not isinstance(url, str) or not isinstance(title, str):
            continue
        url = url.strip()
        title = title.strip()

        # Skip if missing required fields or duplicate
        if not url or not title or url in seen_urls:
            continue

        summary = article.get('summary')
        if not isinstance(summary, str):
            summary = 'No summary available.'

        validated_articles.append({
            'title': title,
            'url': url,
            'summary': summary.strip()
        })
        seen_urls.add(url)

    logger.info(f"✅ Successfully parsed {len(validated_articles)} unique articles")
    return validated_articles

# ===============================
# Main Execution
# ===============================

async def main():
    """Run a full-site scrape and merge the results into articles.json.

    Steps: load the existing store, run the scraping agent, parse its
    output, append articles whose URL is new, save the store, regenerate the
    Markdown export and print a summary. When the agent yields no usable
    articles the function returns early and leaves all files untouched. Any
    exception is logged, printed with its traceback and re-raised.
    """
    
    print("\n" + "="*60)
    print("🍽️  DRINKSFOODLIFE.COM COMPREHENSIVE SCRAPER")
    print("="*60 + "\n")
    
    start_time = time.time()
    
    try:
        # Load existing articles
        print("📂 Checking for existing articles...")
        articles_data = load_existing_articles()
        # Count the list itself: a stored 'total_articles' value may be
        # missing (KeyError) or stale in a hand-edited or older file.
        initial_count = len(articles_data.get('articles', []))
        print(f"📊 Found {initial_count} existing articles\n")
        
        # Scrape the entire website
        print("🔍 Starting comprehensive website exploration...")
        print("⏳ This may take several minutes...\n")
        result = await scrape_entire_website()
        
        # Parse results
        new_articles = parse_agent_result(result)
        
        if not new_articles:
            logger.error("❌ No articles were extracted. Check the agent output.")
            logger.info(f"Raw result type: {type(result)}")
            print("\n⚠️  No new articles found.")
            return
        
        print(f"\n✨ Discovered {len(new_articles)} articles from scraping!\n")
        
        # Add new articles (avoiding duplicates)
        print("🔄 Processing and deduplicating articles...")
        articles_data, added_count = add_new_articles(articles_data, new_articles)
        
        # Save updated articles.json
        print("💾 Saving to articles.json...")
        save_articles_json(articles_data)
        
        # Create final markdown
        print("📝 Generating final markdown document...")
        create_final_markdown(articles_data)
        
        # Summary
        elapsed_time = time.time() - start_time
        print("\n" + "="*60)
        print("📊 SCRAPING SUMMARY")
        print("="*60)
        print(f"✅ Total articles in database: {articles_data['total_articles']}")
        print(f"➕ New articles added: {added_count}")
        print(f"📋 Articles skipped (duplicates): {len(new_articles) - added_count}")
        print(f"⏱️  Time taken: {elapsed_time:.2f} seconds")
        print(f"📁 Output directory: {OUTPUT_DIR.absolute()}")
        print(f"📄 Main file: {ARTICLES_JSON_PATH.name}")
        print("="*60 + "\n")
        
        logger.info("✨ All done! Check articles.json and the final markdown file.")
        
    except Exception as e:
        logger.error(f"❌ Fatal error: {str(e)}")
        print(f"\n❌ An error occurred: {str(e)}")
        # Imported here because it is only needed on the failure path.
        import traceback
        traceback.print_exc()
        raise

if __name__ == "__main__":
    # `get_ipython` is only defined inside IPython/Jupyter sessions; calling
    # it anywhere else raises NameError, which is used as the detector.
    try:
        get_ipython()
    except NameError:
        # Plain Python process: start the event loop ourselves.
        asyncio.run(main())
    else:
        # Interactive session: asyncio.run() is not used here; the user is
        # asked to await the coroutine from a cell instead.
        print("\n⚠️  Running in Jupyter notebook detected!")
        print("📝 Please run this command in a new cell:")
        print("   await main()")

🔧 Initializing DrinksFoodLife Article Scraper...
✅ LLM initialized

⚠️  Running in Jupyter notebook detected!
📝 Please run this command in a new cell:
   await main()


In [10]:
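# Needed to actually run the scraper: inside a notebook, main() is awaited directly instead of using asyncio.run().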
await main()


🍽️  DRINKSFOODLIFE.COM COMPREHENSIVE SCRAPER

📂 Checking for existing articles...
INFO     [__main__] 🔄 Converting legacy format: 14 articles
INFO     [__main__] ✅ Loaded 14 existing articles
📊 Found 14 existing articles

🔍 Starting comprehensive website exploration...
⏳ This may take several minutes...

INFO     [__main__] 🚀 Starting comprehensive website scraping...
🌐 Opening browser and exploring entire website...
INFO     [Agent] [34m🚀 Task: 
    Navigate to https://www.drinksfoodlife.com and perform a COMPREHENSIVE scraping of the entire website.

    EXPLORATION STRATEGY:
    1. Start at the homepage
    2. Identify all navigation links, category pages, archive pages, and pagination
    3. Visit EVERY accessible page on the website
    4. For each article page you discover:
       - Extract the exact article title
       - Extract the complete article URL
       - Read the full article content
       - Create a concise summary of approximately 100 tokens

    IMPORTANT INSTRUCT

  for i, style_index in enumerate(style_indices):


INFO     [Agent] 

INFO     [Agent] 📍 Step 12:
INFO     [Agent]   [32m👍 Eval: Successfully scrolled down the page, but the main navigation links are still not visible. Verdict: Failure (goal not achieved)[0m
INFO     [Agent]   [34m🎯 Next goal: Input 'recipes' into the search bar and click the search button to find a category page or related articles.[0m
INFO     [Agent]   🦾 [34m[ACTION 1/2][0m input_text: index: 5, text: recipes, clear_existing: True
INFO     [BrowserSession] ⌨️ Typed "recipes" into element with index 5
INFO     [Agent]   🦾 [34m[ACTION 2/2][0m click_element_by_index: index: 6, while_holding_ctrl: False
INFO     [tools] 🖱️ Clicked element with index 6
INFO     [Agent] 

INFO     [Agent] 📍 Step 13:
INFO     [Agent]   [32m👍 Eval: Successfully input 'recipes' into the search bar and clicked the search button. Verdict: Success[0m
INFO     [Agent]   [34m🎯 Next goal: Examine the current page to confirm that search results for 'recipes' are displayed and then begin 