In [1]:
# Install required packages
"""
Install all required dependencies for web scraping and LLM integration
"""

# Install required packages
!pip install -q requests beautifulsoup4 openai tqdm fake-useragent selenium webdriver-manager lxml



[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


## Google Map Web Scraping with LLM

## Link 1

In [5]:
# üîç Intelligent Web Scraper: Google Maps + LLM-Powered Page Selection
# Enhanced Google Colab Notebook with Smart Page Selection

# ============================================================================
# SECTION 1: SETUP AND INSTALLATIONS
# ============================================================================



# Import all necessary libraries
import os
import re
import json
import time
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from tqdm import tqdm
from fake_useragent import UserAgent
from collections import deque
import warnings
warnings.filterwarnings('ignore')

# OpenAI for LLM integration
import openai

# Selenium imports (for dynamic content if needed)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# print("‚úÖ All packages installed successfully!")

# ============================================================================
# SECTION 2: CONFIGURATION AND API SETUP
# ============================================================================

"""
Configure API keys and scraping parameters
"""

# Set your OpenAI API Key
# For Google Colab, use userdata secrets or set directly
try:
    from google.colab import userdata
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
except:
    # Fallback for local environment
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'your-api-key-here')

# Initialize OpenAI client
openai.api_key = OPENAI_API_KEY

# Scraping configuration
MAX_CRAWL_DEPTH = 2  # How deep to crawl internal links
MAX_PAGES = 10  # Maximum pages to discover during crawl
TOP_PAGES_TO_ANALYZE = 6  # Number of most relevant pages to analyze
REQUEST_TIMEOUT = 10  # Seconds
RATE_LIMIT_DELAY = 1  # Seconds between requests

# Initialize user agent rotator
ua = UserAgent()

print("‚úÖ Configuration loaded successfully!")

# ============================================================================
# SECTION 3: GOOGLE MAPS URL PROCESSOR
# ============================================================================

"""
Extract business website URL from Google Maps link
"""

def setup_selenium_driver():
    """Initialize Selenium WebDriver for dynamic content"""
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument(f'user-agent={ua.random}')
    
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

def extract_website_from_google_maps(maps_url, use_selenium=True):
    """
    Extract the main business website URL from a Google Maps link
    
    Args:
        maps_url (str): Google Maps URL
        use_selenium (bool): Whether to use Selenium for dynamic content
    
    Returns:
        str: Extracted website URL or None
    """
    print(f"üîç Extracting website from Google Maps URL...")
    
    if use_selenium:
        driver = None
        try:
            driver = setup_selenium_driver()
            driver.get(maps_url)
            
            # Wait for page to load
            time.sleep(3)
            
            # Look for website link with multiple selectors
            selectors = [
                "a[data-item-id='authority']",
                "a[aria-label*='Website']",
                "a[data-tooltip='Open website']",
                "button[data-item-id='authority']"
            ]
            
            for selector in selectors:
                try:
                    elements = driver.find_elements(By.CSS_SELECTOR, selector)
                    for element in elements:
                        href = element.get_attribute('href')
                        if href and 'google.com' not in href:
                            print(f"‚úÖ Found website: {href}")
                            return href
                except:
                    continue
            
            # Fallback: search in page source
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            
            # Find all links
            for link in soup.find_all('a', href=True):
                href = link['href']
                if href.startswith('http') and 'google.com' not in href and 'gstatic.com' not in href:
                    if not any(x in href for x in ['/maps/', '/search?', 'youtube.com', 'facebook.com']):
                        print(f"‚úÖ Found website via fallback: {href}")
                        return href
                        
        except Exception as e:
            print(f"‚ùå Error with Selenium: {e}")
        finally:
            if driver:
                driver.quit()
    
    # Fallback to requests
    try:
        headers = {'User-Agent': ua.random}
        response = requests.get(maps_url, headers=headers, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Search for website links in HTML
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith('http') and 'google.com' not in href:
                print(f"‚úÖ Found website: {href}")
                return href
    except Exception as e:
        print(f"‚ùå Error with requests: {e}")
    
    print("‚ö†Ô∏è Could not extract website URL")
    return None

# ============================================================================
# SECTION 4: WEBSITE CRAWLER
# ============================================================================

"""
Crawl all internal pages of the website
"""

def is_valid_url(url, base_domain):
    """Check if URL is valid and belongs to the same domain"""
    try:
        parsed = urlparse(url)
        base_parsed = urlparse(base_domain)
        
        # Check if same domain
        if parsed.netloc != base_parsed.netloc:
            return False
        
        # Skip common file extensions and external resources
        skip_extensions = ['.pdf', '.jpg', '.jpeg', '.png', '.gif', '.zip', '.exe', '.mp4', '.mp3']
        if any(url.lower().endswith(ext) for ext in skip_extensions):
            return False
        
        # Skip common non-content paths
        skip_patterns = ['#', 'javascript:', 'mailto:', 'tel:', '/cdn-cgi/', '/wp-admin/']
        if any(pattern in url.lower() for pattern in skip_patterns):
            return False
        
        return True
    except:
        return False

def crawl_website(start_url, max_depth=MAX_CRAWL_DEPTH, max_pages=MAX_PAGES):
    """
    Crawl website and collect all internal page URLs
    
    Args:
        start_url (str): Starting URL
        max_depth (int): Maximum crawl depth
        max_pages (int): Maximum number of pages to crawl
    
    Returns:
        list: List of unique page URLs
    """
    print(f"üï∑Ô∏è Starting website crawl from: {start_url}")
    
    visited = set()
    to_visit = deque([(start_url, 0)])  # (url, depth)
    pages = []
    
    base_domain = f"{urlparse(start_url).scheme}://{urlparse(start_url).netloc}"
    
    while to_visit and len(pages) < max_pages:
        current_url, depth = to_visit.popleft()
        
        if current_url in visited or depth > max_depth:
            continue
        
        visited.add(current_url)
        
        try:
            headers = {'User-Agent': ua.random}
            response = requests.get(current_url, headers=headers, timeout=REQUEST_TIMEOUT, allow_redirects=True)
            
            if response.status_code != 200:
                continue
            
            pages.append(current_url)
            print(f"  ‚úì Discovered [{len(pages)}/{max_pages}]: {current_url}")
            
            # Parse page for more links
            soup = BeautifulSoup(response.content, 'html.parser')
            
            for link in soup.find_all('a', href=True):
                absolute_url = urljoin(current_url, link['href'])
                # Remove fragments and query parameters for deduplication
                clean_url = absolute_url.split('#')[0].split('?')[0]
                
                if is_valid_url(clean_url, base_domain) and clean_url not in visited:
                    to_visit.append((clean_url, depth + 1))
            
            # Rate limiting
            time.sleep(RATE_LIMIT_DELAY)
            
        except Exception as e:
            print(f"  ‚úó Error crawling {current_url}: {str(e)[:50]}")
    
    print(f"‚úÖ Crawl complete! Discovered {len(pages)} pages")
    return pages

# ============================================================================
# SECTION 5: LLM-POWERED PAGE SELECTION
# ============================================================================

"""
Use LLM to intelligently select the most relevant pages for data extraction
"""

def select_relevant_pages_with_llm(page_urls, top_n=TOP_PAGES_TO_ANALYZE):
    """
    Use LLM to select the most relevant pages for business information extraction
    
    Args:
        page_urls (list): List of all discovered page URLs
        top_n (int): Number of top pages to select
    
    Returns:
        list: List of selected page URLs
    """
    print(f"\nü§ñ Using LLM to select top {top_n} most relevant pages...")
    
    # Create a numbered list of URLs for the LLM
    url_list = "\n".join([f"{i+1}. {url}" for i, url in enumerate(page_urls)])
    
    system_prompt = """You are an expert web analyst. Your task is to identify the most relevant pages 
from a website that would contain business information such as company details, services, contact information, 
and business description."""
    
    user_prompt = f"""I have crawled a business website and found {len(page_urls)} pages. 
Please analyze the URLs and select the TOP {top_n} most relevant pages that would likely contain:
- Company name and overview
- About/Company information
- Services or products offered
- Contact information (email, phone, address)
- Social media links
- Business description

Here are all the discovered page URLs:
{url_list}

Prioritize pages like:
- Home/Index pages
- About pages
- Services/Products pages
- Contact pages
- Company/Team pages
- Portfolio/Work pages

Return ONLY a valid JSON array containing the numbers of the selected pages (1-indexed).
Example format: [1, 3, 5, 7, 9, 12, 15]

Your response must be ONLY the JSON array, nothing else."""

    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.2,
            max_tokens=500
        )
        
        # Extract JSON from response
        content = response.choices[0].message.content.strip()
        
        # Remove markdown code blocks if present
        if content.startswith('```'):
            content = content.split('```')[1]
            if content.startswith('json'):
                content = content[4:]
            content = content.strip()
        
        # Parse JSON array
        selected_indices = json.loads(content)
        
        # Convert 1-indexed to 0-indexed and get URLs
        selected_urls = [page_urls[i-1] for i in selected_indices if 0 < i <= len(page_urls)]
        
        print(f"‚úÖ LLM selected {len(selected_urls)} pages:")
        for i, url in enumerate(selected_urls, 1):
            print(f"   {i}. {url}")
        
        return selected_urls
        
    except Exception as e:
        print(f"‚ùå Error in LLM page selection: {e}")
        print("‚ö†Ô∏è Falling back to heuristic selection...")
        
        # Fallback: Use keyword-based heuristic
        priority_keywords = ['home', 'about', 'contact', 'service', 'product', 'portfolio', 'team', 'company']
        scored_pages = []
        
        for url in page_urls:
            url_lower = url.lower()
            score = sum(1 for keyword in priority_keywords if keyword in url_lower)
            # Prioritize shorter URLs (often more important)
            score += (100 - len(url)) / 100
            scored_pages.append((score, url))
        
        # Sort by score and take top N
        scored_pages.sort(reverse=True, key=lambda x: x[0])
        selected = [url for _, url in scored_pages[:top_n]]
        
        print(f"‚úÖ Heuristically selected {len(selected)} pages")
        return selected

# ============================================================================
# SECTION 6: CONTENT EXTRACTOR
# ============================================================================

"""
Extract clean text content from web pages
"""

def extract_page_content(url):
    """
    Extract visible text content from a webpage
    
    Args:
        url (str): Page URL
    
    Returns:
        str: Extracted text content
    """
    try:
        headers = {'User-Agent': ua.random}
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        
        if response.status_code != 200:
            return ""
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Remove script and style elements
        for script in soup(['script', 'style', 'nav', 'footer', 'header']):
            script.decompose()
        
        # Get text
        text = soup.get_text(separator=' ', strip=True)
        
        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        
        return text
    
    except Exception as e:
        print(f"  ‚úó Error extracting content from {url}: {e}")
        return ""

# ============================================================================
# SECTION 7: LLM-POWERED CONSOLIDATED DATA EXTRACTION
# ============================================================================

"""
Use LLM to extract all business data from combined page content
"""

def extract_business_data_with_llm(page_contents, main_website_url):
    """
    Use LLM to extract consolidated business information from multiple pages
    
    Args:
        page_contents (dict): Dictionary mapping URLs to their text content
        main_website_url (str): Main website URL
    
    Returns:
        dict: Consolidated business information
    """
    print("\nü§ñ Using LLM to extract consolidated business data...")
    
    # Combine all page contents with clear separation
    combined_content = ""
    for url, content in page_contents.items():
        # Truncate individual pages if too long
        truncated_content = content[:8000] if len(content) > 8000 else content
        combined_content += f"\n\n--- PAGE: {url} ---\n{truncated_content}\n"
    
    # Limit total content size
    max_total_chars = 40000
    if len(combined_content) > max_total_chars:
        combined_content = combined_content[:max_total_chars] + "\n\n[Content truncated due to length...]"
    
    system_prompt = """You are an expert business data extraction assistant. Analyze the provided webpage content 
from multiple pages of a business website and extract comprehensive, accurate business information. 
Consolidate information from all pages to provide the most complete picture."""
    
    user_prompt = f"""Analyze the content from {len(page_contents)} pages of this business website: {main_website_url}

Extract and return a SINGLE consolidated JSON object with these exact fields:

1. company_name: The official business/company name
2. company_main_url: The main website URL ({main_website_url})
3. emails: Array of ALL unique email addresses found across all pages
4. contact_numbers: Array of ALL unique phone numbers found (include country code if present)
5. social_media_links: Array of ALL social media profile URLs (Facebook, Instagram, LinkedIn, Twitter/X, YouTube, TikTok, etc.)
6. summary: A comprehensive 3-5 sentence summary describing:
   - What the business does
   - Main services/products offered
   - Key specialties or unique offerings
   - Target market/audience if mentioned

IMPORTANT:
- Use null for fields where no information is found
- For arrays, return empty array [] if no items found
- Deduplicate all arrays (no repeated emails, phones, or social links)
- The summary should be detailed and informative, covering all services/activities mentioned

Combined website content:
{combined_content}

Return ONLY a valid JSON object with these exact field names. No additional text or explanation."""

    try:
        print("  ‚è≥ Sending request to LLM (this may take 10-20 seconds)...")
        
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.1,
            max_tokens=2000
        )
        
        # Extract JSON from response
        content = response.choices[0].message.content.strip()
        
        # Remove markdown code blocks if present
        if content.startswith('```'):
            content = content.split('```')[1]
            if content.startswith('json'):
                content = content[4:]
            content = content.strip()
        
        # Parse JSON
        extracted_data = json.loads(content)
        
        # Ensure all required fields exist
        required_fields = ["company_name", "company_main_url", "emails", "contact_numbers", 
                          "social_media_links", "summary"]
        for field in required_fields:
            if field not in extracted_data:
                extracted_data[field] = None if field not in ["emails", "contact_numbers", "social_media_links"] else []
        
        print("‚úÖ Data extraction successful!")
        return extracted_data
        
    except Exception as e:
        print(f"‚ùå LLM extraction error: {e}")
        return {
            "company_name": None,
            "company_main_url": main_website_url,
            "emails": [],
            "contact_numbers": [],
            "social_media_links": [],
            "summary": None,
            "error": str(e)
        }

# ============================================================================
# SECTION 8: MAIN ORCHESTRATION FUNCTION
# ============================================================================

"""
Main function to orchestrate the entire scraping pipeline
"""

def scrape_business_data(google_maps_url):
    """
    Complete pipeline to scrape business data from Google Maps URL
    
    Args:
        google_maps_url (str): Google Maps URL of the business
    
    Returns:
        tuple: (extracted_data dict, business_name str)
    """
    print("=" * 80)
    print("üöÄ STARTING INTELLIGENT BUSINESS DATA EXTRACTION")
    print("=" * 80)
    
    # Step 1: Extract website from Google Maps
    website_url = extract_website_from_google_maps(google_maps_url)
    
    if not website_url:
        print("‚ùå Failed to extract website URL from Google Maps")
        return None, None
    
    print(f"\nüìç Main Website: {website_url}\n")
    
    # Step 2: Crawl website to discover all pages
    all_pages = crawl_website(website_url)
    
    if not all_pages:
        print("‚ùå No pages found to scrape")
        return None, None
    
    print(f"\nüìÑ Discovered {len(all_pages)} total pages")
    
    # Step 3: Use LLM to select most relevant pages
    selected_pages = select_relevant_pages_with_llm(all_pages, TOP_PAGES_TO_ANALYZE)
    
    if not selected_pages:
        print("‚ùå No pages selected for analysis")
        return None, None
    
    # Step 4: Extract content from selected pages
    print(f"\nüì• Extracting content from {len(selected_pages)} selected pages...")
    page_contents = {}
    
    for i, page_url in enumerate(selected_pages, 1):
        print(f"  [{i}/{len(selected_pages)}] Extracting: {page_url}")
        content = extract_page_content(page_url)
        if content:
            page_contents[page_url] = content
        time.sleep(RATE_LIMIT_DELAY)
    
    print(f"‚úÖ Extracted content from {len(page_contents)} pages")
    
    if not page_contents:
        print("‚ùå No content extracted from any page")
        return None, None
    
    # Step 5: Use LLM to extract consolidated business data
    extracted_data = extract_business_data_with_llm(page_contents, website_url)
    
    # Extract business name for filename
    business_name = extracted_data.get('company_name', 'unknown_business')
    if business_name:
        # Clean business name for filename
        business_name = re.sub(r'[^\w\s-]', '', business_name)
        business_name = re.sub(r'[-\s]+', '_', business_name).lower()
    
    print("\n" + "=" * 80)
    print("‚úÖ EXTRACTION COMPLETE!")
    print("=" * 80)
    
    return extracted_data, business_name

# ============================================================================
# SECTION 9: DATA SAVING AND DISPLAY
# ============================================================================

"""
Save and display extraction results
"""

def save_results(extracted_data, business_name, all_pages_count, selected_pages_count):
    """
    Save extraction results to JSON file with business name
    
    Args:
        extracted_data (dict): Extracted business data
        business_name (str): Business name for filename
        all_pages_count (int): Total pages discovered
        selected_pages_count (int): Pages analyzed
    
    Returns:
        str: Output filename
    """
    # Create filename based on business name
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    filename = f"{business_name}_{timestamp}.json"
    
    output = {
        "business_data": extracted_data,
        "extraction_metadata": {
            "total_pages_discovered": all_pages_count,
            "pages_analyzed": selected_pages_count,
            "extraction_method": "LLM-powered intelligent page selection",
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "model_used": "gpt-4o-mini"
        }
    }
    
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    
    print(f"\nüíæ Results saved to: {filename}")
    return filename

def display_summary(extracted_data):
    """Display a formatted summary of extracted data"""
    print("\n" + "=" * 80)
    print("üìä EXTRACTION SUMMARY")
    print("=" * 80)
    
    print(f"\nüè¢ Company Name: {extracted_data.get('company_name', 'N/A')}")
    print(f"üåê Website: {extracted_data.get('company_main_url', 'N/A')}")
    
    emails = extracted_data.get('emails', [])
    print(f"\nüìß Emails ({len(emails)} found):")
    if emails:
        for email in emails[:5]:
            print(f"   ‚Ä¢ {email}")
        if len(emails) > 5:
            print(f"   ... and {len(emails) - 5} more")
    else:
        print("   None found")
    
    phones = extracted_data.get('contact_numbers', [])
    print(f"\nüì± Phone Numbers ({len(phones)} found):")
    if phones:
        for phone in phones[:5]:
            print(f"   ‚Ä¢ {phone}")
        if len(phones) > 5:
            print(f"   ... and {len(phones) - 5} more")
    else:
        print("   None found")
    
    socials = extracted_data.get('social_media_links', [])
    print(f"\nüîó Social Media ({len(socials)} links):")
    if socials:
        for link in socials:
            # Extract platform name
            platform = "Unknown"
            if 'facebook.com' in link:
                platform = "Facebook"
            elif 'instagram.com' in link:
                platform = "Instagram"
            elif 'linkedin.com' in link:
                platform = "LinkedIn"
            elif 'twitter.com' in link or 'x.com' in link:
                platform = "Twitter/X"
            elif 'youtube.com' in link:
                platform = "YouTube"
            elif 'tiktok.com' in link:
                platform = "TikTok"
            print(f"   ‚Ä¢ {platform}: {link}")
    else:
        print("   None found")
    
    summary = extracted_data.get('summary', 'N/A')
    print(f"\nüìù Business Summary:")
    print(f"   {summary}")
    
    print("\n" + "=" * 80)

# ============================================================================
# SECTION 10: EXECUTION
# ============================================================================

"""
Run the complete scraping pipeline
"""

# INPUT: Paste your Google Maps URL here
GOOGLE_MAPS_URL = "https://www.google.com/maps/place/Xiao+Chi+Jie/@47.6135353,-122.2003497,1017m/data=!3m2!1e3!5s0x54906c87f6b05be7:0x7257abf958f252ea!4m6!3m5!1s0x54906d9668cef7b3:0xb7b3f7bd67692ab2!8m2!3d47.6126588!4d-122.1986349!16s%2Fg%2F11ghnpwhdz?entry=ttu&g_ep=EgoyMDI1MTExMS4wIKXMDSoASAFQAw%3D%3D"

# Execute the pipeline
if __name__ == "__main__":
    # Validate API key
    if not OPENAI_API_KEY or OPENAI_API_KEY == "your-api-key-here":
        print("‚ùå ERROR: Please set your OPENAI_API_KEY!")
        print("In Google Colab: Use Secrets (key icon) to add OPENAI_API_KEY")
        print("Locally: Set environment variable or update the code")
    else:
        # Store initial counts
        all_pages_discovered = 0
        pages_analyzed = 0
        
        # Run the scraper
        extracted_data, business_name = scrape_business_data(GOOGLE_MAPS_URL)
        
        if extracted_data and business_name:
            # Save results with business name in filename
            filename = save_results(
                extracted_data, 
                business_name,
                all_pages_discovered,
                TOP_PAGES_TO_ANALYZE
            )
            
            # Display summary
            display_summary(extracted_data)
            
            # Download file in Colab
            try:
                from google.colab import files
                files.download(filename)
                print(f"\n‚¨áÔ∏è Downloading: {filename}")
            except:
                print(f"üìÅ File saved locally: {filename}")
        else:
            print("‚ùå No data extracted. Please check the Google Maps URL and try again.")

print("\n‚úÖ Script execution complete!")

‚úÖ Configuration loaded successfully!
üöÄ STARTING INTELLIGENT BUSINESS DATA EXTRACTION
üîç Extracting website from Google Maps URL...
‚ö†Ô∏è Could not extract website URL
‚ùå Failed to extract website URL from Google Maps
‚ùå No data extracted. Please check the Google Maps URL and try again.

‚úÖ Script execution complete!


## Link 2


In [21]:
# üîç Intelligent Web Scraper: Google Maps + LLM-Powered Page Selection
# Enhanced Google Colab Notebook with Smart Page Selection

# ============================================================================
# SECTION 1: SETUP AND INSTALLATIONS
# ============================================================================



# Import all necessary libraries
import os
import re
import json
import time
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from tqdm import tqdm
from fake_useragent import UserAgent
from collections import deque
import warnings
warnings.filterwarnings('ignore')

# OpenAI for LLM integration
import openai

# Selenium imports (for dynamic content if needed)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# print("‚úÖ All packages installed successfully!")

# ============================================================================
# SECTION 2: CONFIGURATION AND API SETUP
# ============================================================================

"""
Configure API keys and scraping parameters
"""

# Set your OpenAI API Key
# For Google Colab, use userdata secrets or set directly
try:
    from google.colab import userdata
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
except:
    # Fallback for local environment
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'your-api-key-here')

# Initialize OpenAI client
openai.api_key = OPENAI_API_KEY

# Scraping configuration
MAX_CRAWL_DEPTH = 2  # How deep to crawl internal links
MAX_PAGES = 30  # Maximum pages to discover during crawl
TOP_PAGES_TO_ANALYZE = 30  # Number of most relevant pages to analyze
REQUEST_TIMEOUT = 10  # Seconds
RATE_LIMIT_DELAY = 1  # Seconds between requests

# Initialize user agent rotator
ua = UserAgent()

print("‚úÖ Configuration loaded successfully!")

# ============================================================================
# SECTION 3: GOOGLE MAPS URL PROCESSOR
# ============================================================================

"""
Extract business website URL from Google Maps link
"""

def setup_selenium_driver():
    """Initialize Selenium WebDriver for dynamic content"""
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument(f'user-agent={ua.random}')
    
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

def extract_website_from_google_maps(maps_url, use_selenium=True):
    """
    Extract the main business website URL from a Google Maps link
    
    Args:
        maps_url (str): Google Maps URL
        use_selenium (bool): Whether to use Selenium for dynamic content
    
    Returns:
        str: Extracted website URL or None
    """
    print(f"üîç Extracting website from Google Maps URL...")
    
    if use_selenium:
        driver = None
        try:
            driver = setup_selenium_driver()
            driver.get(maps_url)
            
            # Wait for page to load
            time.sleep(3)
            
            # Look for website link with multiple selectors
            selectors = [
                "a[data-item-id='authority']",
                "a[aria-label*='Website']",
                "a[data-tooltip='Open website']",
                "button[data-item-id='authority']"
            ]
            
            for selector in selectors:
                try:
                    elements = driver.find_elements(By.CSS_SELECTOR, selector)
                    for element in elements:
                        href = element.get_attribute('href')
                        if href and 'google.com' not in href:
                            print(f"‚úÖ Found website: {href}")
                            return href
                except:
                    continue
            
            # Fallback: search in page source
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            
            # Find all links
            for link in soup.find_all('a', href=True):
                href = link['href']
                if href.startswith('http') and 'google.com' not in href and 'gstatic.com' not in href:
                    if not any(x in href for x in ['/maps/', '/search?', 'youtube.com', 'facebook.com']):
                        print(f"‚úÖ Found website via fallback: {href}")
                        return href
                        
        except Exception as e:
            print(f"‚ùå Error with Selenium: {e}")
        finally:
            if driver:
                driver.quit()
    
    # Fallback to requests
    try:
        headers = {'User-Agent': ua.random}
        response = requests.get(maps_url, headers=headers, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Search for website links in HTML
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith('http') and 'google.com' not in href:
                print(f"‚úÖ Found website: {href}")
                return href
    except Exception as e:
        print(f"‚ùå Error with requests: {e}")
    
    print("‚ö†Ô∏è Could not extract website URL")
    return None

# ============================================================================
# SECTION 4: WEBSITE CRAWLER
# ============================================================================

"""
Crawl all internal pages of the website
"""

def is_valid_url(url, base_domain):
    """Check if URL is valid and belongs to the same domain"""
    try:
        parsed = urlparse(url)
        base_parsed = urlparse(base_domain)
        
        # Check if same domain
        if parsed.netloc != base_parsed.netloc:
            return False
        
        # Skip common file extensions and external resources
        skip_extensions = ['.pdf', '.jpg', '.jpeg', '.png', '.gif', '.zip', '.exe', '.mp4', '.mp3']
        if any(url.lower().endswith(ext) for ext in skip_extensions):
            return False
        
        # Skip common non-content paths
        skip_patterns = ['#', 'javascript:', 'mailto:', 'tel:', '/cdn-cgi/', '/wp-admin/']
        if any(pattern in url.lower() for pattern in skip_patterns):
            return False
        
        return True
    except:
        return False

def crawl_website(start_url, max_depth=MAX_CRAWL_DEPTH, max_pages=MAX_PAGES):
    """
    Crawl website and collect all internal page URLs
    
    Args:
        start_url (str): Starting URL
        max_depth (int): Maximum crawl depth
        max_pages (int): Maximum number of pages to crawl
    
    Returns:
        list: List of unique page URLs
    """
    print(f"üï∑Ô∏è Starting website crawl from: {start_url}")
    
    visited = set()
    to_visit = deque([(start_url, 0)])  # (url, depth)
    pages = []
    
    base_domain = f"{urlparse(start_url).scheme}://{urlparse(start_url).netloc}"
    
    while to_visit and len(pages) < max_pages:
        current_url, depth = to_visit.popleft()
        
        if current_url in visited or depth > max_depth:
            continue
        
        visited.add(current_url)
        
        try:
            headers = {'User-Agent': ua.random}
            response = requests.get(current_url, headers=headers, timeout=REQUEST_TIMEOUT, allow_redirects=True)
            
            if response.status_code != 200:
                continue
            
            pages.append(current_url)
            print(f"  ‚úì Discovered [{len(pages)}/{max_pages}]: {current_url}")
            
            # Parse page for more links
            soup = BeautifulSoup(response.content, 'html.parser')
            
            for link in soup.find_all('a', href=True):
                absolute_url = urljoin(current_url, link['href'])
                # Remove fragments and query parameters for deduplication
                clean_url = absolute_url.split('#')[0].split('?')[0]
                
                if is_valid_url(clean_url, base_domain) and clean_url not in visited:
                    to_visit.append((clean_url, depth + 1))
            
            # Rate limiting
            time.sleep(RATE_LIMIT_DELAY)
            
        except Exception as e:
            print(f"  ‚úó Error crawling {current_url}: {str(e)[:50]}")
    
    print(f"‚úÖ Crawl complete! Discovered {len(pages)} pages")
    return pages

# ============================================================================
# SECTION 5: LLM-POWERED PAGE SELECTION
# ============================================================================

"""
Use LLM to intelligently select the most relevant pages for data extraction
"""

def select_relevant_pages_with_llm(page_urls, top_n=TOP_PAGES_TO_ANALYZE):
    """
    Use LLM to select the most relevant pages for business information extraction
    
    Args:
        page_urls (list): List of all discovered page URLs
        top_n (int): Number of top pages to select
    
    Returns:
        list: List of selected page URLs
    """
    print(f"\nü§ñ Using LLM to select top {top_n} most relevant pages...")
    
    # Create a numbered list of URLs for the LLM
    url_list = "\n".join([f"{i+1}. {url}" for i, url in enumerate(page_urls)])
    
    system_prompt = """You are an expert web analyst. Your task is to identify the most relevant pages 
from a website that would contain business information such as company details, services, contact information, 
and business description."""
    
    user_prompt = f"""I have crawled a business website and found {len(page_urls)} pages. 
Please analyze the URLs and select the TOP {top_n} most relevant pages that would likely contain:
- Company name and overview
- About/Company information
- Services or products offered
- Contact information (email, phone, address)
- Social media links
- Business description

Here are all the discovered page URLs:
{url_list}

Prioritize pages like:
- Home/Index pages
- About pages
- Services/Products pages
- Contact pages
- Company/Team pages
- Portfolio/Work pages

Return ONLY a valid JSON array containing the numbers of the selected pages (1-indexed).
Example format: [1, 3, 5, 7, 9, 12, 15]

Your response must be ONLY the JSON array, nothing else."""

    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.2,
            max_tokens=500
        )
        
        # Extract JSON from response
        content = response.choices[0].message.content.strip()
        
        # Remove markdown code blocks if present
        if content.startswith('```'):
            content = content.split('```')[1]
            if content.startswith('json'):
                content = content[4:]
            content = content.strip()
        
        # Parse JSON array
        selected_indices = json.loads(content)
        
        # Convert 1-indexed to 0-indexed and get URLs
        selected_urls = [page_urls[i-1] for i in selected_indices if 0 < i <= len(page_urls)]
        
        print(f"‚úÖ LLM selected {len(selected_urls)} pages:")
        for i, url in enumerate(selected_urls, 1):
            print(f"   {i}. {url}")
        
        return selected_urls
        
    except Exception as e:
        print(f"‚ùå Error in LLM page selection: {e}")
        print("‚ö†Ô∏è Falling back to heuristic selection...")
        
        # Fallback: Use keyword-based heuristic
        priority_keywords = ['home', 'about', 'contact', 'service', 'product', 'portfolio', 'team', 'company']
        scored_pages = []
        
        for url in page_urls:
            url_lower = url.lower()
            score = sum(1 for keyword in priority_keywords if keyword in url_lower)
            # Prioritize shorter URLs (often more important)
            score += (100 - len(url)) / 100
            scored_pages.append((score, url))
        
        # Sort by score and take top N
        scored_pages.sort(reverse=True, key=lambda x: x[0])
        selected = [url for _, url in scored_pages[:top_n]]
        
        print(f"‚úÖ Heuristically selected {len(selected)} pages")
        return selected

# ============================================================================
# SECTION 6: CONTENT EXTRACTOR
# ============================================================================

"""
Extract clean text content from web pages
"""

def extract_page_content(url):
    """
    Extract visible text content from a webpage
    
    Args:
        url (str): Page URL
    
    Returns:
        str: Extracted text content
    """
    try:
        headers = {'User-Agent': ua.random}
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        
        if response.status_code != 200:
            return ""
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Remove script and style elements
        for script in soup(['script', 'style', 'nav', 'footer', 'header']):
            script.decompose()
        
        # Get text
        text = soup.get_text(separator=' ', strip=True)
        
        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        
        return text
    
    except Exception as e:
        print(f"  ‚úó Error extracting content from {url}: {e}")
        return ""

# ============================================================================
# SECTION 7: LLM-POWERED CONSOLIDATED DATA EXTRACTION
# ============================================================================

"""
Use LLM to extract all business data from combined page content
"""

def extract_business_data_with_llm(page_contents, main_website_url):
    """
    Use LLM to extract consolidated business information from multiple pages
    
    Args:
        page_contents (dict): Dictionary mapping URLs to their text content
        main_website_url (str): Main website URL
    
    Returns:
        dict: Consolidated business information
    """
    print("\nü§ñ Using LLM to extract consolidated business data...")
    
    # Combine all page contents with clear separation
    combined_content = ""
    for url, content in page_contents.items():
        # Truncate individual pages if too long
        truncated_content = content[:8000] if len(content) > 8000 else content
        combined_content += f"\n\n--- PAGE: {url} ---\n{truncated_content}\n"
    
    # Limit total content size
    max_total_chars = 40000
    if len(combined_content) > max_total_chars:
        combined_content = combined_content[:max_total_chars] + "\n\n[Content truncated due to length...]"
    
    system_prompt = """You are an expert business data extraction assistant. Analyze the provided webpage content 
from multiple pages of a business website and extract comprehensive, accurate business information. 
Consolidate information from all pages to provide the most complete picture."""
    
    user_prompt = f"""Analyze the content from {len(page_contents)} pages of this business website: {main_website_url}

Extract and return a SINGLE consolidated JSON object with these exact fields:

1. company_name: The official business/company name
2. company_main_url: The main website URL ({main_website_url})
3. emails: Array of ALL unique email addresses found across all pages
4. contact_numbers: Array of ALL unique phone numbers found (include country code if present)
5. social_media_links: Array of ALL social media profile URLs (Facebook, Instagram, LinkedIn, Twitter/X, YouTube, TikTok, etc.)
6. summary: A comprehensive 3-5 sentence summary describing:
   - What the business does
   - Main services/products offered
   - Key specialties or unique offerings
   - Target market/audience if mentioned

IMPORTANT:
- Use null for fields where no information is found
- For arrays, return empty array [] if no items found
- Deduplicate all arrays (no repeated emails, phones, or social links)
- The summary should be detailed and informative, covering all services/activities mentioned

Combined website content:
{combined_content}

Return ONLY a valid JSON object with these exact field names. No additional text or explanation."""

    try:
        print("  ‚è≥ Sending request to LLM (this may take 10-20 seconds)...")
        
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.1,
            max_tokens=2000
        )
        
        # Extract JSON from response
        content = response.choices[0].message.content.strip()
        
        # Remove markdown code blocks if present
        if content.startswith('```'):
            content = content.split('```')[1]
            if content.startswith('json'):
                content = content[4:]
            content = content.strip()
        
        # Parse JSON
        extracted_data = json.loads(content)
        
        # Ensure all required fields exist
        required_fields = ["company_name", "company_main_url", "emails", "contact_numbers", 
                          "social_media_links", "summary"]
        for field in required_fields:
            if field not in extracted_data:
                extracted_data[field] = None if field not in ["emails", "contact_numbers", "social_media_links"] else []
        
        print("‚úÖ Data extraction successful!")
        return extracted_data
        
    except Exception as e:
        print(f"‚ùå LLM extraction error: {e}")
        return {
            "company_name": None,
            "company_main_url": main_website_url,
            "emails": [],
            "contact_numbers": [],
            "social_media_links": [],
            "summary": None,
            "error": str(e)
        }

# ============================================================================
# SECTION 8: MAIN ORCHESTRATION FUNCTION
# ============================================================================

"""
Main function to orchestrate the entire scraping pipeline
"""

def scrape_business_data(google_maps_url):
    """
    Complete pipeline to scrape business data from Google Maps URL
    
    Args:
        google_maps_url (str): Google Maps URL of the business
    
    Returns:
        tuple: (extracted_data dict, business_name str)
    """
    print("=" * 80)
    print("üöÄ STARTING INTELLIGENT BUSINESS DATA EXTRACTION")
    print("=" * 80)
    
    # Step 1: Extract website from Google Maps
    website_url = extract_website_from_google_maps(google_maps_url)
    
    if not website_url:
        print("‚ùå Failed to extract website URL from Google Maps")
        return None, None
    
    print(f"\nüìç Main Website: {website_url}\n")
    
    # Step 2: Crawl website to discover all pages
    all_pages = crawl_website(website_url)
    
    if not all_pages:
        print("‚ùå No pages found to scrape")
        return None, None
    
    print(f"\nüìÑ Discovered {len(all_pages)} total pages")
    
    # Step 3: Use LLM to select most relevant pages
    selected_pages = select_relevant_pages_with_llm(all_pages, TOP_PAGES_TO_ANALYZE)
    
    if not selected_pages:
        print("‚ùå No pages selected for analysis")
        return None, None
    
    # Step 4: Extract content from selected pages
    print(f"\nüì• Extracting content from {len(selected_pages)} selected pages...")
    page_contents = {}
    
    for i, page_url in enumerate(selected_pages, 1):
        print(f"  [{i}/{len(selected_pages)}] Extracting: {page_url}")
        content = extract_page_content(page_url)
        if content:
            page_contents[page_url] = content
        time.sleep(RATE_LIMIT_DELAY)
    
    print(f"‚úÖ Extracted content from {len(page_contents)} pages")
    
    if not page_contents:
        print("‚ùå No content extracted from any page")
        return None, None
    
    # Step 5: Use LLM to extract consolidated business data
    extracted_data = extract_business_data_with_llm(page_contents, website_url)
    
    # Extract business name for filename
    business_name = extracted_data.get('company_name', 'unknown_business')
    if business_name:
        # Clean business name for filename
        business_name = re.sub(r'[^\w\s-]', '', business_name)
        business_name = re.sub(r'[-\s]+', '_', business_name).lower()
    
    print("\n" + "=" * 80)
    print("‚úÖ EXTRACTION COMPLETE!")
    print("=" * 80)
    
    return extracted_data, business_name

# ============================================================================
# SECTION 9: DATA SAVING AND DISPLAY
# ============================================================================

"""
Save and display extraction results
"""

def save_results(extracted_data, business_name, all_pages_count, selected_pages_count):
    """
    Save extraction results to JSON file with business name
    
    Args:
        extracted_data (dict): Extracted business data
        business_name (str): Business name for filename
        all_pages_count (int): Total pages discovered
        selected_pages_count (int): Pages analyzed
    
    Returns:
        str: Output filename
    """
    # Create filename based on business name
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    filename = f"{business_name}_{timestamp}.json"
    
    output = {
        "business_data": extracted_data,
        "extraction_metadata": {
            "total_pages_discovered": all_pages_count,
            "pages_analyzed": selected_pages_count,
            "extraction_method": "LLM-powered intelligent page selection",
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "model_used": "gpt-4o-mini"
        }
    }
    
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    
    print(f"\nüíæ Results saved to: {filename}")
    return filename

def display_summary(extracted_data):
    """Display a formatted summary of extracted data"""
    print("\n" + "=" * 80)
    print("üìä EXTRACTION SUMMARY")
    print("=" * 80)
    
    print(f"\nüè¢ Company Name: {extracted_data.get('company_name', 'N/A')}")
    print(f"üåê Website: {extracted_data.get('company_main_url', 'N/A')}")
    
    emails = extracted_data.get('emails', [])
    print(f"\nüìß Emails ({len(emails)} found):")
    if emails:
        for email in emails[:5]:
            print(f"   ‚Ä¢ {email}")
        if len(emails) > 5:
            print(f"   ... and {len(emails) - 5} more")
    else:
        print("   None found")
    
    phones = extracted_data.get('contact_numbers', [])
    print(f"\nüì± Phone Numbers ({len(phones)} found):")
    if phones:
        for phone in phones[:5]:
            print(f"   ‚Ä¢ {phone}")
        if len(phones) > 5:
            print(f"   ... and {len(phones) - 5} more")
    else:
        print("   None found")
    
    socials = extracted_data.get('social_media_links', [])
    print(f"\nüîó Social Media ({len(socials)} links):")
    if socials:
        for link in socials:
            # Extract platform name
            platform = "Unknown"
            if 'facebook.com' in link:
                platform = "Facebook"
            elif 'instagram.com' in link:
                platform = "Instagram"
            elif 'linkedin.com' in link:
                platform = "LinkedIn"
            elif 'twitter.com' in link or 'x.com' in link:
                platform = "Twitter/X"
            elif 'youtube.com' in link:
                platform = "YouTube"
            elif 'tiktok.com' in link:
                platform = "TikTok"
            print(f"   ‚Ä¢ {platform}: {link}")
    else:
        print("   None found")
    
    summary = extracted_data.get('summary', 'N/A')
    print(f"\nüìù Business Summary:")
    print(f"   {summary}")
    
    print("\n" + "=" * 80)

# ============================================================================
# SECTION 10: EXECUTION
# ============================================================================

"""
Run the complete scraping pipeline
"""

# INPUT: Paste your Google Maps URL here
GOOGLE_MAPS_URL = "https://maps.app.goo.gl/Tvdq57DwjeCz1w4V6"

# Execute the pipeline
if __name__ == "__main__":
    # Validate API key
    if not OPENAI_API_KEY or OPENAI_API_KEY == "your-api-key-here":
        print("‚ùå ERROR: Please set your OPENAI_API_KEY!")
        print("In Google Colab: Use Secrets (key icon) to add OPENAI_API_KEY")
        print("Locally: Set environment variable or update the code")
    else:
        # Store initial counts
        all_pages_discovered = 0
        pages_analyzed = 0
        
        # Run the scraper
        extracted_data, business_name = scrape_business_data(GOOGLE_MAPS_URL)
        
        if extracted_data and business_name:
            # Save results with business name in filename
            filename = save_results(
                extracted_data, 
                business_name,
                all_pages_discovered,
                TOP_PAGES_TO_ANALYZE
            )
            
            # Display summary
            display_summary(extracted_data)
            
            # Download file in Colab
            try:
                from google.colab import files
                files.download(filename)
                print(f"\n‚¨áÔ∏è Downloading: {filename}")
            except:
                print(f"üìÅ File saved locally: {filename}")
        else:
            print("‚ùå No data extracted. Please check the Google Maps URL and try again.")

print("\n‚úÖ Script execution complete!")

‚úÖ Configuration loaded successfully!
üöÄ STARTING INTELLIGENT BUSINESS DATA EXTRACTION
üîç Extracting website from Google Maps URL...
‚úÖ Found website: http://www.tasteoftexas.com/

üìç Main Website: http://www.tasteoftexas.com/

üï∑Ô∏è Starting website crawl from: http://www.tasteoftexas.com/
  ‚úì Discovered [1/30]: http://www.tasteoftexas.com/
  ‚úì Discovered [2/30]: http://www.tasteoftexas.com/visit-us/
  ‚úì Discovered [3/30]: http://www.tasteoftexas.com/compare
  ‚úì Discovered [4/30]: http://www.tasteoftexas.com/cart.php
  ‚úì Discovered [5/30]: https://www.tasteoftexas.com/
  ‚úì Discovered [6/30]: https://www.tasteoftexas.com/menu/
  ‚úì Discovered [7/30]: https://www.tasteoftexas.com/wine-comparison/
  ‚úì Discovered [8/30]: https://www.tasteoftexas.com/visit-us/
  ‚úì Discovered [9/30]: https://www.tasteoftexas.com/private-events/
  ‚úì Discovered [10/30]: https://www.tasteoftexas.com/faqs/
  ‚úì Discovered [11/30]: https://www.tasteoftexas.com/to-go/
  ‚úì Discovere

In [23]:
# üîç Intelligent Web Scraper: Google Maps + LLM-Powered Page Selection
# FIXED VERSION - Improved LLM extraction and fallback methods

import os
import re
import json
import time
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from tqdm import tqdm
from fake_useragent import UserAgent
from collections import deque
import warnings
warnings.filterwarnings('ignore')

import openai

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# ============================================================================
# CONFIGURATION
# ============================================================================

try:
    from google.colab import userdata
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
except:
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'your-api-key-here')

openai.api_key = OPENAI_API_KEY

MAX_CRAWL_DEPTH = 2
MAX_PAGES = 30
TOP_PAGES_TO_ANALYZE = 15  # Optimized for better token management
REQUEST_TIMEOUT = 10
RATE_LIMIT_DELAY = 1

ua = UserAgent()
print("‚úÖ Configuration loaded successfully!")

# ============================================================================
# GOOGLE MAPS URL PROCESSOR
# ============================================================================

def setup_selenium_driver():
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument(f'user-agent={ua.random}')
    
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

def extract_website_from_google_maps(maps_url, use_selenium=True):
    print(f"üîç Extracting website from Google Maps URL...")
    
    if use_selenium:
        driver = None
        try:
            driver = setup_selenium_driver()
            driver.get(maps_url)
            time.sleep(3)
            selectors = [
                "a[data-item-id='authority']",
                "a[aria-label*='Website']",
                "a[data-tooltip='Open website']",
                "button[data-item-id='authority']"
            ]
            
            for selector in selectors:
                try:
                    elements = driver.find_elements(By.CSS_SELECTOR, selector)
                    for element in elements:
                        href = element.get_attribute('href')
                        if href and 'google.com' not in href:
                            print(f"‚úÖ Found website: {href}")
                            return href
                except:
                    continue
            
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for link in soup.find_all('a', href=True):
                href = link['href']
                if href.startswith('http') and 'google.com' not in href and 'gstatic.com' not in href:
                    if not any(x in href for x in ['/maps/', '/search?', 'youtube.com', 'facebook.com']):
                        print(f"‚úÖ Found website via fallback: {href}")
                        return href
        except Exception as e:
            print(f"‚ùå Error with Selenium: {e}")
        finally:
            if driver:
                driver.quit()
    
    try:
        headers = {'User-Agent': ua.random}
        response = requests.get(maps_url, headers=headers, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith('http') and 'google.com' not in href:
                print(f"‚úÖ Found website: {href}")
                return href
    except Exception as e:
        print(f"‚ùå Error with requests: {e}")
    
    print("‚ö†Ô∏è Could not extract website URL")
    return None

# ============================================================================
# WEBSITE CRAWLER
# ============================================================================

def is_valid_url(url, base_domain):
    try:
        parsed = urlparse(url)
        base_parsed = urlparse(base_domain)
        if parsed.netloc != base_parsed.netloc:
            return False
        skip_extensions = ['.pdf', '.jpg', '.jpeg', '.png', '.gif', '.zip', '.exe', '.mp4', '.mp3']
        if any(url.lower().endswith(ext) for ext in skip_extensions):
            return False
        skip_patterns = ['#', 'javascript:', 'mailto:', 'tel:', '/cdn-cgi/', '/wp-admin/']
        if any(pattern in url.lower() for pattern in skip_patterns):
            return False
        return True
    except:
        return False

def crawl_website(start_url, max_depth=MAX_CRAWL_DEPTH, max_pages=MAX_PAGES):
    print(f"üï∑Ô∏è Starting website crawl from: {start_url}")
    
    visited = set()
    to_visit = deque([(start_url.rstrip('/'), 0)])
    pages = []
    base_domain = f"{urlparse(start_url).scheme}://{urlparse(start_url).netloc}"
    
    while to_visit and len(pages) < max_pages:
        current_url, depth = to_visit.popleft()
        current_url = current_url.rstrip('/')
        if current_url in visited or depth > max_depth:
            continue
        visited.add(current_url)
        try:
            headers = {'User-Agent': ua.random}
            response = requests.get(current_url, headers=headers, timeout=REQUEST_TIMEOUT, allow_redirects=True)
            if response.status_code != 200:
                continue
            pages.append(current_url)
            print(f"  ‚úì Discovered [{len(pages)}/{max_pages}]: {current_url}")
            soup = BeautifulSoup(response.content, 'html.parser')
            for link in soup.find_all('a', href=True):
                absolute_url = urljoin(current_url, link['href'])
                clean_url = absolute_url.split('#')[0].split('?')[0].rstrip('/')
                if is_valid_url(clean_url, base_domain) and clean_url not in visited:
                    to_visit.append((clean_url, depth + 1))
            time.sleep(RATE_LIMIT_DELAY)
        except Exception as e:
            print(f"  ‚úó Error crawling {current_url}: {str(e)[:50]}")
    
    print(f"‚úÖ Crawl complete! Discovered {len(pages)} pages")
    return pages

# ============================================================================
# LLM-POWERED PAGE SELECTION
# ============================================================================

def select_relevant_pages_with_llm(page_urls, top_n=TOP_PAGES_TO_ANALYZE):
    print(f"\nü§ñ Using LLM to select top {top_n} most relevant pages...")
    url_list = "\n".join([f"{i+1}. {url}" for i, url in enumerate(page_urls)])
    
    system_prompt = """You are an expert web analyst. Your job is to identify the most relevant pages that contain business information like company details, contact info, services, and about information."""
    
    user_prompt = f"""Analyze these {len(page_urls)} URLs and select the top {top_n} most relevant pages for extracting company information (like contact details, about us, services, etc.).

URLs:
{url_list}

Return ONLY a valid JSON array of numbers (1-indexed positions) like: [1, 3, 5, 7, 9]
Do not include any other text or explanation."""

    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.2,
            max_tokens=500
        )
        content = response.choices[0].message.content.strip()
        print(f"üîç LLM raw response: {content[:200]}")
        
        if '```' in content:
            content = content.split('```')[1]
            if content.startswith('json'):
                content = content[4:]
            content = content.strip()
        
        selected_indices = json.loads(content)
        selected_urls = [page_urls[i-1] for i in selected_indices if 0 < i <= len(page_urls)]
        
        print(f"‚úÖ LLM selected {len(selected_urls)} pages:")
        for i, url in enumerate(selected_urls, 1):
            print(f"   {i}. {url}")
        return selected_urls[:top_n]
        
    except Exception as e:
        print(f"‚ö†Ô∏è LLM selection error: {e}")
        print("üîÑ Using heuristic fallback...")
        priority_keywords = ['home', 'about', 'contact', 'service', 'product', 'portfolio', 'team', 'company', 'catering']
        scored_pages = []
        for url in page_urls:
            url_lower = url.lower()
            score = sum(2 for keyword in priority_keywords if keyword in url_lower)
            score += (100 - len(url)) / 100
            scored_pages.append((score, url))
        scored_pages.sort(reverse=True, key=lambda x: x[0])
        selected = [url for _, url in scored_pages[:top_n]]
        print(f"‚úÖ Heuristic selected {len(selected)} pages")
        return selected

# ============================================================================
# CONTENT EXTRACTOR - IMPROVED
# ============================================================================

def extract_page_content(url):
    try:
        headers = {'User-Agent': ua.random}
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            return ""
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Remove unwanted elements
        for script in soup(['script', 'style', 'nav', 'footer', 'iframe', 'noscript']):
            script.decompose()
        
        # Extract text with better formatting
        text = soup.get_text(separator=' ', strip=True)
        
        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        cleaned_text = ' '.join(chunk for chunk in chunks if chunk)
        
        return cleaned_text
    except Exception as e:
        print(f"  ‚úó Error extracting content from {url}: {e}")
        return ""

# ============================================================================
# IMPROVED LLM EXTRACTION WITH BETTER ERROR HANDLING
# ============================================================================

def extract_business_data_with_llm(page_contents, main_website_url):
    print("\nü§ñ Using LLM to extract consolidated business data...")
    
    # Prepare content with smart truncation
    combined_content = ""
    max_chars_per_page = 6000  # Increased slightly for better context
    
    for url, content in page_contents.items():
        truncated_content = content[:max_chars_per_page] if len(content) > max_chars_per_page else content
        combined_content += f"\n\n=== PAGE: {url} ===\n{truncated_content}\n"
    
    # Limit total content to stay within token limits
    max_total_chars = 40000
    if len(combined_content) > max_total_chars:
        combined_content = combined_content[:max_total_chars] + "\n\n[Content truncated for length]"
    
    print(f"üìä Total content length: {len(combined_content)} characters")

    system_prompt = """You are an expert business data extraction assistant. Extract comprehensive business information from website content and return it as valid JSON. Be thorough in finding all contact details, emails, phone numbers, and social media links."""
    
    user_prompt = f"""Extract ALL business information from these web pages for: {main_website_url}

Website Content:
{combined_content}

Return ONLY a valid JSON object (no markdown, no explanations) with this structure:
{{
  "company_name": "full company name found on the website",
  "company_main_url": "{main_website_url}",
  "emails": ["list all unique email addresses found"],
  "contact_numbers": ["list all phone numbers with country codes if available"],
  "social_media_links": ["list all social media URLs - Facebook, Instagram, LinkedIn, Twitter, YouTube, etc."],
  "summary": "Write a comprehensive 5-10 line summary describing what the company does, their services/products, target audience, and unique value proposition based on ALL the content analyzed"
}}

Important:
- Find ALL emails and phone numbers across all pages
- Include country codes in phone numbers when visible
- Extract complete social media URLs
- Write a detailed, informative summary that captures the essence of the business
- Use empty arrays [] if nothing found, not null"""

    try:
        print("üîÑ Calling OpenAI API...")
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.1,
            max_tokens=3000  # Increased for more detailed extraction
        )
        
        content = response.choices[0].message.content.strip()
        print(f"üîç LLM response received ({len(content)} chars)")
        
        # Enhanced JSON extraction
        json_str = content
        
        # Remove markdown code blocks
        if '```json' in json_str:
            json_str = json_str.split('```json')[1].split('```')[0].strip()
        elif '```' in json_str:
            json_str = json_str.split('```')[1].split('```')[0].strip()
        
        # Try to find JSON object if surrounded by text
        if not json_str.startswith('{'):
            match = re.search(r'\{.*\}', json_str, re.DOTALL)
            if match:
                json_str = match.group(0)
        
        # Parse JSON
        extracted_data = json.loads(json_str)
        
        # Validate and clean data
        extracted_data = validate_and_clean_data(extracted_data, main_website_url)
        
        print("‚úÖ Data extraction successful via LLM!")
        return extracted_data
        
    except json.JSONDecodeError as e:
        print(f"‚ùå JSON parsing error: {e}")
        print(f"üîç Attempting to parse response: {content[:300]}...")
        return create_fallback_data(main_website_url, page_contents)
    except Exception as e:
        print(f"‚ùå LLM extraction error: {e}")
        return create_fallback_data(main_website_url, page_contents)

def validate_and_clean_data(data, main_url):
    """Validate and clean extracted data"""
    cleaned = {
        "company_name": data.get("company_name") or urlparse(main_url).netloc.replace('www.', '').split('.')[0].title(),
        "company_main_url": main_url,
        "emails": [],
        "contact_numbers": [],
        "social_media_links": [],
        "summary": data.get("summary") or "No summary available"
    }
    
    # Clean emails
    if data.get("emails") and isinstance(data["emails"], list):
        email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
        cleaned["emails"] = [e.strip() for e in data["emails"] if re.match(email_pattern, e.strip())]
    
    # Clean phone numbers
    if data.get("contact_numbers") and isinstance(data["contact_numbers"], list):
        cleaned["contact_numbers"] = [p.strip() for p in data["contact_numbers"] if p and len(str(p).strip()) > 5]
    
    # Clean social media links
    if data.get("social_media_links") and isinstance(data["social_media_links"], list):
        social_domains = ['facebook.com', 'instagram.com', 'linkedin.com', 'twitter.com', 'x.com', 
                         'youtube.com', 'tiktok.com', 'pinterest.com']
        cleaned["social_media_links"] = [
            s.strip() for s in data["social_media_links"] 
            if s and any(domain in s.lower() for domain in social_domains)
        ]
    
    return cleaned

# ============================================================================
# IMPROVED FALLBACK DATA EXTRACTION
# ============================================================================

def create_fallback_data(main_website_url, page_contents):
    """Enhanced fallback extraction with better regex patterns"""
    print("üîÑ Using enhanced fallback extraction...")
    
    all_text = " ".join(page_contents.values())
    
    # Extract emails with better pattern
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'
    emails = list(set(re.findall(email_pattern, all_text)))
    emails = [e for e in emails if not e.endswith(('.png', '.jpg', '.gif'))]
    
    # Extract phone numbers with improved patterns
    phone_patterns = [
        r'\+\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}',  # International
        r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',  # US format
        r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}',  # Simple format
    ]
    
    phones = set()
    for pattern in phone_patterns:
        found = re.findall(pattern, all_text)
        for phone in found:
            # Clean up phone number
            cleaned = re.sub(r'[^\d+()-]', '', phone)
            if len(re.sub(r'[^\d]', '', cleaned)) >= 10:  # At least 10 digits
                phones.add(phone.strip())
    
    # Extract social media links
    social_patterns = {
        'facebook.com': r'https?://(?:www\.)?facebook\.com/[a-zA-Z0-9._-]+',
        'instagram.com': r'https?://(?:www\.)?instagram\.com/[a-zA-Z0-9._-]+',
        'linkedin.com': r'https?://(?:www\.)?linkedin\.com/(?:company|in)/[a-zA-Z0-9._-]+',
        'twitter.com': r'https?://(?:www\.)?(?:twitter|x)\.com/[a-zA-Z0-9._-]+',
        'youtube.com': r'https?://(?:www\.)?youtube\.com/(?:c|channel|user)/[a-zA-Z0-9._-]+',
        'tiktok.com': r'https?://(?:www\.)?tiktok\.com/@[a-zA-Z0-9._-]+'
    }
    
    socials = []
    for platform, pattern in social_patterns.items():
        found = re.findall(pattern, all_text, re.IGNORECASE)
        socials.extend(found)
    socials = list(set(socials))
    
    # Generate better company name
    company_name = urlparse(main_website_url).netloc.replace('www.', '').split('.')[0]
    company_name = ' '.join(word.capitalize() for word in re.split(r'[-_]', company_name))
    
    return {
        "company_name": company_name,
        "company_main_url": main_website_url,
        "emails": sorted(list(set(emails)))[:15],
        "contact_numbers": sorted(list(phones))[:15],
        "social_media_links": sorted(socials),
        "summary": "Business information extracted using automated fallback method. Manual verification recommended for accuracy.",
        "extraction_method": "enhanced_regex_fallback"
    }

# ============================================================================
# MAIN ORCHESTRATION
# ============================================================================

def scrape_business_data(google_maps_url):
    print("="*80)
    print("üöÄ STARTING INTELLIGENT BUSINESS DATA EXTRACTION")
    print("="*80)
    
    website_url = extract_website_from_google_maps(google_maps_url)
    if not website_url:
        print("‚ùå Failed to extract website URL from Google Maps")
        return None, None, 0, 0

    parsed = urlparse(website_url)
    website_url = f"{parsed.scheme}://{parsed.netloc}/"
    print(f"\nüìç Main Website: {website_url}\n")
    
    all_pages = crawl_website(website_url)
    if not all_pages:
        print("‚ùå No pages found to scrape")
        return None, None, 0, 0
    
    selected_pages = select_relevant_pages_with_llm(all_pages, TOP_PAGES_TO_ANALYZE)
    if not selected_pages:
        print("‚ùå No pages selected for analysis")
        return None, None, len(all_pages), 0

    print(f"\nüì• Extracting content from {len(selected_pages)} selected pages...")
    page_contents = {}
    for i, page_url in enumerate(selected_pages, 1):
        print(f"  [{i}/{len(selected_pages)}] Extracting: {page_url}")
        content = extract_page_content(page_url)
        if content:
            page_contents[page_url] = content
            print(f"      ‚úì Extracted {len(content)} characters")
        time.sleep(RATE_LIMIT_DELAY)
    
    if not page_contents:
        print("‚ùå No content extracted from any page")
        return None, None, len(all_pages), len(selected_pages)

    extracted_data = extract_business_data_with_llm(page_contents, website_url)
    business_name = extracted_data.get('company_name', 'unknown_business')
    if business_name:
        business_name = re.sub(r'[^\w\s-]', '', str(business_name))
        business_name = re.sub(r'[-\s]+', '_', business_name).lower()

    print("\n" + "="*80)
    print("‚úÖ EXTRACTION COMPLETE!")
    print("="*80)
    
    return extracted_data, business_name, len(all_pages), len(page_contents)

# ============================================================================
# SAVE AND DISPLAY RESULTS
# ============================================================================

def save_results(extracted_data, business_name, all_pages_count, selected_pages_count):
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    filename = f"{business_name}_{timestamp}.json"
    output = {
        "business_data": extracted_data,
        "extraction_metadata": {
            "total_pages_discovered": all_pages_count,
            "pages_analyzed": selected_pages_count,
            "extraction_method": "LLM-powered intelligent page selection",
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "model_used": "gpt-4o-mini"
        }
    }
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\nüíæ Results saved to: {filename}")
    return filename

def display_summary(extracted_data):
    print("\n" + "="*80)
    print("üìä EXTRACTION SUMMARY")
    print("="*80)
    
    print(f"\nüè¢ Company Name: {extracted_data.get('company_name', 'N/A')}")
    print(f"üåê Website: {extracted_data.get('company_main_url', 'N/A')}")
    
    emails = extracted_data.get('emails', [])
    print(f"\nüìß Emails ({len(emails)} found):")
    if emails:
        for email in emails[:5]:
            print(f"   ‚Ä¢ {email}")
        if len(emails) > 5:
            print(f"   ... and {len(emails) - 5} more")
    else:
        print("   None found")
    
    phones = extracted_data.get('contact_numbers', [])
    print(f"\nüì± Phone Numbers ({len(phones)} found):")
    if phones:
        for phone in phones[:5]:
            print(f"   ‚Ä¢ {phone}")
        if len(phones) > 5:
            print(f"   ... and {len(phones) - 5} more")
    else:
        print("   None found")
    
    socials = extracted_data.get('social_media_links', [])
    print(f"\nüîó Social Media ({len(socials)} links):")
    if socials:
        for link in socials:
            platform = "Unknown"
            if 'facebook.com' in link: platform = "Facebook"
            elif 'instagram.com' in link: platform = "Instagram"
            elif 'linkedin.com' in link: platform = "LinkedIn"
            elif 'twitter.com' in link or 'x.com' in link: platform = "Twitter/X"
            elif 'youtube.com' in link: platform = "YouTube"
            elif 'tiktok.com' in link: platform = "TikTok"
            print(f"   ‚Ä¢ {platform}: {link}")
    else:
        print("   None found")
    
    summary = extracted_data.get('summary', 'N/A')
    print(f"\nüìù Business Summary:\n   {summary}")
    
    if 'extraction_method' in extracted_data:
        print(f"\n‚öôÔ∏è Extraction Method: {extracted_data['extraction_method']}")
    
    print("\n" + "="*80)

# ============================================================================
# EXECUTION
# ============================================================================

GOOGLE_MAPS_URL = "https://maps.app.goo.gl/Tvdq57DwjeCz1w4V6"

if __name__ == "__main__":
    if not OPENAI_API_KEY or OPENAI_API_KEY == "your-api-key-here":
        print("‚ùå ERROR: Please set your OPENAI_API_KEY!")
    else:
        extracted_data, business_name, all_pages_discovered, pages_analyzed = scrape_business_data(GOOGLE_MAPS_URL)
        
        if extracted_data and business_name:
            filename = save_results(
                extracted_data, 
                business_name,
                all_pages_discovered,
                pages_analyzed
            )
            display_summary(extracted_data)
            try:
                from google.colab import files
                files.download(filename)
                print("üì• File download started!")
            except:
                print(f"üìÅ File saved locally: {filename}")
        else:
            print("‚ùå No data extracted. Check the URL and API key, then try again.")

print("\n‚úÖ Script execution complete!")

‚úÖ Configuration loaded successfully!
üöÄ STARTING INTELLIGENT BUSINESS DATA EXTRACTION
üîç Extracting website from Google Maps URL...
‚úÖ Found website: http://www.tasteoftexas.com/

üìç Main Website: http://www.tasteoftexas.com/

üï∑Ô∏è Starting website crawl from: http://www.tasteoftexas.com/
  ‚úì Discovered [1/30]: http://www.tasteoftexas.com
  ‚úì Discovered [2/30]: http://www.tasteoftexas.com/visit-us
  ‚úì Discovered [3/30]: http://www.tasteoftexas.com/compare
  ‚úì Discovered [4/30]: http://www.tasteoftexas.com/cart.php
  ‚úì Discovered [5/30]: https://www.tasteoftexas.com
  ‚úì Discovered [6/30]: https://www.tasteoftexas.com/menu
  ‚úì Discovered [7/30]: https://www.tasteoftexas.com/wine-comparison
  ‚úì Discovered [8/30]: https://www.tasteoftexas.com/visit-us
  ‚úì Discovered [9/30]: https://www.tasteoftexas.com/private-events
  ‚úì Discovered [10/30]: https://www.tasteoftexas.com/faqs
  ‚úì Discovered [11/30]: https://www.tasteoftexas.com/to-go
  ‚úì Discovered [12/30]

In [29]:
# üîç Intelligent Web Scraper: Google Maps + LLM-Powered Page Selection
# FIXED VERSION - Improved LLM extraction and fallback methods

import os
import re
import json
import time
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from tqdm import tqdm
from fake_useragent import UserAgent
from collections import deque
import warnings
warnings.filterwarnings('ignore')

import openai

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# ============================================================================
# CONFIGURATION
# ============================================================================

try:
    from google.colab import userdata
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
except:
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'your-api-key-here')

openai.api_key = OPENAI_API_KEY

MAX_CRAWL_DEPTH = 2
MAX_PAGES = 15
TOP_PAGES_TO_ANALYZE = 10  # Optimized for better token management
REQUEST_TIMEOUT = 10
RATE_LIMIT_DELAY = 1

ua = UserAgent()
print("‚úÖ Configuration loaded successfully!")

# ============================================================================
# GOOGLE MAPS URL PROCESSOR
# ============================================================================

def setup_selenium_driver():
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument(f'user-agent={ua.random}')
    
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

def extract_website_from_google_maps(maps_url, use_selenium=True):
    print(f"üîç Extracting website from Google Maps URL...")
    
    if use_selenium:
        driver = None
        try:
            driver = setup_selenium_driver()
            driver.get(maps_url)
            time.sleep(3)
            selectors = [
                "a[data-item-id='authority']",
                "a[aria-label*='Website']",
                "a[data-tooltip='Open website']",
                "button[data-item-id='authority']"
            ]
            
            for selector in selectors:
                try:
                    elements = driver.find_elements(By.CSS_SELECTOR, selector)
                    for element in elements:
                        href = element.get_attribute('href')
                        if href and 'google.com' not in href:
                            print(f"‚úÖ Found website: {href}")
                            return href
                except:
                    continue
            
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for link in soup.find_all('a', href=True):
                href = link['href']
                if href.startswith('http') and 'google.com' not in href and 'gstatic.com' not in href:
                    if not any(x in href for x in ['/maps/', '/search?', 'youtube.com', 'facebook.com']):
                        print(f"‚úÖ Found website via fallback: {href}")
                        return href
        except Exception as e:
            print(f"‚ùå Error with Selenium: {e}")
        finally:
            if driver:
                driver.quit()
    
    try:
        headers = {'User-Agent': ua.random}
        response = requests.get(maps_url, headers=headers, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith('http') and 'google.com' not in href:
                print(f"‚úÖ Found website: {href}")
                return href
    except Exception as e:
        print(f"‚ùå Error with requests: {e}")
    
    print("‚ö†Ô∏è Could not extract website URL")
    return None

# ============================================================================
# WEBSITE CRAWLER
# ============================================================================

def is_valid_url(url, base_domain):
    try:
        parsed = urlparse(url)
        base_parsed = urlparse(base_domain)
        if parsed.netloc != base_parsed.netloc:
            return False
        skip_extensions = ['.pdf', '.jpg', '.jpeg', '.png', '.gif', '.zip', '.exe', '.mp4', '.mp3']
        if any(url.lower().endswith(ext) for ext in skip_extensions):
            return False
        skip_patterns = ['#', 'javascript:', 'mailto:', 'tel:', '/cdn-cgi/', '/wp-admin/']
        if any(pattern in url.lower() for pattern in skip_patterns):
            return False
        return True
    except:
        return False

def crawl_website(start_url, max_depth=MAX_CRAWL_DEPTH, max_pages=MAX_PAGES):
    print(f"üï∑Ô∏è Starting website crawl from: {start_url}")
    
    visited = set()
    to_visit = deque([(start_url.rstrip('/'), 0)])
    pages = []
    base_domain = f"{urlparse(start_url).scheme}://{urlparse(start_url).netloc}"
    
    while to_visit and len(pages) < max_pages:
        current_url, depth = to_visit.popleft()
        current_url = current_url.rstrip('/')
        if current_url in visited or depth > max_depth:
            continue
        visited.add(current_url)
        try:
            headers = {'User-Agent': ua.random}
            response = requests.get(current_url, headers=headers, timeout=REQUEST_TIMEOUT, allow_redirects=True)
            if response.status_code != 200:
                continue
            pages.append(current_url)
            print(f"  ‚úì Discovered [{len(pages)}/{max_pages}]: {current_url}")
            soup = BeautifulSoup(response.content, 'html.parser')
            for link in soup.find_all('a', href=True):
                absolute_url = urljoin(current_url, link['href'])
                clean_url = absolute_url.split('#')[0].split('?')[0].rstrip('/')
                if is_valid_url(clean_url, base_domain) and clean_url not in visited:
                    to_visit.append((clean_url, depth + 1))
            time.sleep(RATE_LIMIT_DELAY)
        except Exception as e:
            print(f"  ‚úó Error crawling {current_url}: {str(e)[:50]}")
    
    print(f"‚úÖ Crawl complete! Discovered {len(pages)} pages")
    return pages

# ============================================================================
# LLM-POWERED PAGE SELECTION
# ============================================================================

def select_relevant_pages_with_llm(page_urls, top_n=TOP_PAGES_TO_ANALYZE):
    print(f"\nü§ñ Using LLM to select top {top_n} most relevant pages...")
    url_list = "\n".join([f"{i+1}. {url}" for i, url in enumerate(page_urls)])
    
    system_prompt = """You are an expert web analyst. Your job is to identify the most relevant pages that contain business information like company details, contact info, services, and about information."""
    
    user_prompt = f"""Analyze these {len(page_urls)} URLs and select the top {top_n} most relevant pages for extracting company information (like contact details, about us, services, etc.).

URLs:
{url_list}

Return ONLY a valid JSON array of numbers (1-indexed positions) like: [1, 3, 5, 7, 9]
Do not include any other text or explanation."""

    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.2,
            max_tokens=500
        )
        content = response.choices[0].message.content.strip()
        print(f"üîç LLM raw response: {content[:200]}")
        
        if '```' in content:
            content = content.split('```')[1]
            if content.startswith('json'):
                content = content[4:]
            content = content.strip()
        
        selected_indices = json.loads(content)
        selected_urls = [page_urls[i-1] for i in selected_indices if 0 < i <= len(page_urls)]
        
        print(f"‚úÖ LLM selected {len(selected_urls)} pages:")
        for i, url in enumerate(selected_urls, 1):
            print(f"   {i}. {url}")
        return selected_urls[:top_n]
        
    except Exception as e:
        print(f"‚ö†Ô∏è LLM selection error: {e}")
        print("üîÑ Using heuristic fallback...")
        priority_keywords = ['home', 'about', 'contact', 'service', 'product', 'portfolio', 'team', 'company', 'catering']
        scored_pages = []
        for url in page_urls:
            url_lower = url.lower()
            score = sum(2 for keyword in priority_keywords if keyword in url_lower)
            score += (100 - len(url)) / 100
            scored_pages.append((score, url))
        scored_pages.sort(reverse=True, key=lambda x: x[0])
        selected = [url for _, url in scored_pages[:top_n]]
        print(f"‚úÖ Heuristic selected {len(selected)} pages")
        return selected

# ============================================================================
# CONTENT EXTRACTOR - IMPROVED
# ============================================================================

def extract_page_content(url):
    try:
        headers = {'User-Agent': ua.random}
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            return ""
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Remove unwanted elements
        for script in soup(['script', 'style', 'nav', 'footer', 'iframe', 'noscript']):
            script.decompose()
        
        # Extract text with better formatting
        text = soup.get_text(separator=' ', strip=True)
        
        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        cleaned_text = ' '.join(chunk for chunk in chunks if chunk)
        
        return cleaned_text
    except Exception as e:
        print(f"  ‚úó Error extracting content from {url}: {e}")
        return ""

# ============================================================================
# IMPROVED LLM EXTRACTION WITH BETTER ERROR HANDLING
# ============================================================================

def extract_business_data_with_llm(page_contents, main_website_url):
    print("\nü§ñ Using LLM to extract consolidated business data...")
    
    # Prepare content with smart truncation
    combined_content = ""
    max_chars_per_page = 8000  # Increased for better context
    
    for url, content in page_contents.items():
        truncated_content = content[:max_chars_per_page] if len(content) > max_chars_per_page else content
        combined_content += f"\n\n=== PAGE: {url} ===\n{truncated_content}\n"
    
    # Limit total content to stay within token limits (more generous)
    max_total_chars = 60000
    if len(combined_content) > max_total_chars:
        combined_content = combined_content[:max_total_chars] + "\n\n[Content truncated for length]"
    
    print(f"üìä Total content length: {len(combined_content)} characters")

    system_prompt = """You are an expert business data extraction assistant. Extract comprehensive business information from website content and return ONLY valid JSON with no additional text, markdown formatting, or explanations."""
    
    user_prompt = f"""Analyze the following website content and extract ALL business information for: {main_website_url}

{combined_content}

Extract and return ONLY a JSON object with this EXACT structure (no markdown, no text before or after):
{{
  "company_name": "Full official company name",
  "company_main_url": "{main_website_url}",
  "emails": ["email1@domain.com", "email2@domain.com"],
  "contact_numbers": ["+1-234-567-8900", "234-567-8900"],
  "social_media_links": ["https://facebook.com/page", "https://instagram.com/profile"],
  "summary": "A comprehensive 5-10 line summary describing: what the company does, main services/products offered, target audience/market, unique selling points, company values or mission, and any notable achievements or specializations. Base this on ALL analyzed pages."
}}

CRITICAL INSTRUCTIONS:
1. Find ALL emails, phone numbers, and social media links across all pages
2. Look for social media links in footer, header, contact pages, and inline content
3. Include full URLs for social media (Facebook, Instagram, Twitter/X, LinkedIn, YouTube, TikTok, Pinterest)
4. Write a detailed, informative summary that truly captures what the business does
5. Use empty arrays [] for missing data, never null
6. Return ONLY the JSON object - no explanation, no markdown backticks, no preamble"""

    try:
        print("üîÑ Calling OpenAI API...")
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.1,
            max_tokens=4000,  # Increased significantly
            response_format={"type": "json_object"}  # Force JSON response
        )
        
        content = response.choices[0].message.content.strip()
        print(f"üîç LLM response received ({len(content)} chars)")
        print(f"üìù First 500 chars of response: {content[:500]}")
        
        # Parse JSON directly (response_format ensures it's JSON)
        extracted_data = json.loads(content)
        
        # Validate and clean data
        extracted_data = validate_and_clean_data(extracted_data, main_website_url)
        
        # Additional check - if data looks empty, try fallback
        if (not extracted_data.get('emails') and 
            not extracted_data.get('contact_numbers') and 
            not extracted_data.get('social_media_links')):
            print("‚ö†Ô∏è LLM extraction returned empty data, trying fallback...")
            return create_fallback_data(main_website_url, page_contents)
        
        print("‚úÖ Data extraction successful via LLM!")
        return extracted_data
        
    except json.JSONDecodeError as e:
        print(f"‚ùå JSON parsing error: {e}")
        print(f"üîç Raw response: {content[:500]}")
        return create_fallback_data(main_website_url, page_contents)
    except Exception as e:
        print(f"‚ùå LLM extraction error: {type(e).__name__}: {e}")
        import traceback
        print(f"üîç Traceback: {traceback.format_exc()}")
        return create_fallback_data(main_website_url, page_contents)

def validate_and_clean_data(data, main_url):
    """Validate and clean extracted data"""
    cleaned = {
        "company_name": data.get("company_name") or urlparse(main_url).netloc.replace('www.', '').split('.')[0].title(),
        "company_main_url": main_url,
        "emails": [],
        "contact_numbers": [],
        "social_media_links": [],
        "summary": data.get("summary") or "No summary available"
    }
    
    # Clean emails
    if data.get("emails") and isinstance(data["emails"], list):
        email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
        cleaned["emails"] = [e.strip() for e in data["emails"] if re.match(email_pattern, e.strip())]
    
    # Clean phone numbers
    if data.get("contact_numbers") and isinstance(data["contact_numbers"], list):
        cleaned["contact_numbers"] = [p.strip() for p in data["contact_numbers"] if p and len(str(p).strip()) > 5]
    
    # Clean social media links
    if data.get("social_media_links") and isinstance(data["social_media_links"], list):
        social_domains = ['facebook.com', 'instagram.com', 'linkedin.com', 'twitter.com', 'x.com', 
                         'youtube.com', 'tiktok.com', 'pinterest.com']
        cleaned["social_media_links"] = [
            s.strip() for s in data["social_media_links"] 
            if s and any(domain in s.lower() for domain in social_domains)
        ]
    
    return cleaned

# ============================================================================
# IMPROVED FALLBACK DATA EXTRACTION
# ============================================================================

def create_fallback_data(main_website_url, page_contents):
    """Enhanced fallback extraction with better regex patterns"""
    print("üîÑ Using enhanced fallback extraction...")
    
    all_text = " ".join(page_contents.values())
    
    # Extract emails with better pattern
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'
    emails = list(set(re.findall(email_pattern, all_text)))
    emails = [e for e in emails if not e.endswith(('.png', '.jpg', '.gif'))]
    
    # Extract phone numbers with improved patterns
    phone_patterns = [
        r'\+\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}',  # International
        r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',  # US format
        r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}',  # Simple format
    ]
    
    phones = set()
    for pattern in phone_patterns:
        found = re.findall(pattern, all_text)
        for phone in found:
            # Clean up phone number
            cleaned = re.sub(r'[^\d+()-]', '', phone)
            if len(re.sub(r'[^\d]', '', cleaned)) >= 10:  # At least 10 digits
                phones.add(phone.strip())
    
    # Extract social media links
    social_patterns = {
        'facebook.com': r'https?://(?:www\.)?facebook\.com/[a-zA-Z0-9._-]+',
        'instagram.com': r'https?://(?:www\.)?instagram\.com/[a-zA-Z0-9._-]+',
        'linkedin.com': r'https?://(?:www\.)?linkedin\.com/(?:company|in)/[a-zA-Z0-9._-]+',
        'twitter.com': r'https?://(?:www\.)?(?:twitter|x)\.com/[a-zA-Z0-9._-]+',
        'youtube.com': r'https?://(?:www\.)?youtube\.com/(?:c|channel|user)/[a-zA-Z0-9._-]+',
        'tiktok.com': r'https?://(?:www\.)?tiktok\.com/@[a-zA-Z0-9._-]+'
    }
    
    socials = []
    for platform, pattern in social_patterns.items():
        found = re.findall(pattern, all_text, re.IGNORECASE)
        socials.extend(found)
    socials = list(set(socials))
    
    # Generate better company name
    company_name = urlparse(main_website_url).netloc.replace('www.', '').split('.')[0]
    company_name = ' '.join(word.capitalize() for word in re.split(r'[-_]', company_name))
    
    return {
        "company_name": company_name,
        "company_main_url": main_website_url,
        "emails": sorted(list(set(emails)))[:15],
        "contact_numbers": sorted(list(phones))[:15],
        "social_media_links": sorted(socials),
        "summary": "Business information extracted using automated fallback method. Manual verification recommended for accuracy.",
        "extraction_method": "enhanced_regex_fallback"
    }

# ============================================================================
# MAIN ORCHESTRATION
# ============================================================================

def scrape_business_data(google_maps_url):
    print("="*80)
    print("üöÄ STARTING INTELLIGENT BUSINESS DATA EXTRACTION")
    print("="*80)
    
    website_url = extract_website_from_google_maps(google_maps_url)
    if not website_url:
        print("‚ùå Failed to extract website URL from Google Maps")
        return None, None, 0, 0

    parsed = urlparse(website_url)
    website_url = f"{parsed.scheme}://{parsed.netloc}/"
    print(f"\nüìç Main Website: {website_url}\n")
    
    all_pages = crawl_website(website_url)
    if not all_pages:
        print("‚ùå No pages found to scrape")
        return None, None, 0, 0
    
    selected_pages = select_relevant_pages_with_llm(all_pages, TOP_PAGES_TO_ANALYZE)
    if not selected_pages:
        print("‚ùå No pages selected for analysis")
        return None, None, len(all_pages), 0

    print(f"\nüì• Extracting content from {len(selected_pages)} selected pages...")
    page_contents = {}
    for i, page_url in enumerate(selected_pages, 1):
        print(f"  [{i}/{len(selected_pages)}] Extracting: {page_url}")
        content = extract_page_content(page_url)
        if content:
            page_contents[page_url] = content
            print(f"      ‚úì Extracted {len(content)} characters")
        time.sleep(RATE_LIMIT_DELAY)
    
    if not page_contents:
        print("‚ùå No content extracted from any page")
        return None, None, len(all_pages), len(selected_pages)

    extracted_data = extract_business_data_with_llm(page_contents, website_url)
    business_name = extracted_data.get('company_name', 'unknown_business')
    if business_name:
        business_name = re.sub(r'[^\w\s-]', '', str(business_name))
        business_name = re.sub(r'[-\s]+', '_', business_name).lower()

    print("\n" + "="*80)
    print("‚úÖ EXTRACTION COMPLETE!")
    print("="*80)
    
    return extracted_data, business_name, len(all_pages), len(page_contents)

# ============================================================================
# SAVE AND DISPLAY RESULTS
# ============================================================================

def save_results(extracted_data, business_name, all_pages_count, selected_pages_count):
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    filename = f"{business_name}_{timestamp}.json"
    output = {
        "business_data": extracted_data,
        "extraction_metadata": {
            "total_pages_discovered": all_pages_count,
            "pages_analyzed": selected_pages_count,
            "extraction_method": "LLM-powered intelligent page selection",
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "model_used": "gpt-4o-mini"
        }
    }
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\nüíæ Results saved to: {filename}")
    return filename

def display_summary(extracted_data):
    print("\n" + "="*80)
    print("üìä EXTRACTION SUMMARY")
    print("="*80)
    
    print(f"\nüè¢ Company Name: {extracted_data.get('company_name', 'N/A')}")
    print(f"üåê Website: {extracted_data.get('company_main_url', 'N/A')}")
    
    emails = extracted_data.get('emails', [])
    print(f"\nüìß Emails ({len(emails)} found):")
    if emails:
        for email in emails[:5]:
            print(f"   ‚Ä¢ {email}")
        if len(emails) > 5:
            print(f"   ... and {len(emails) - 5} more")
    else:
        print("   None found")
    
    phones = extracted_data.get('contact_numbers', [])
    print(f"\nüì± Phone Numbers ({len(phones)} found):")
    if phones:
        for phone in phones[:5]:
            print(f"   ‚Ä¢ {phone}")
        if len(phones) > 5:
            print(f"   ... and {len(phones) - 5} more")
    else:
        print("   None found")
    
    socials = extracted_data.get('social_media_links', [])
    print(f"\nüîó Social Media ({len(socials)} links):")
    if socials:
        for link in socials:
            platform = "Unknown"
            if 'facebook.com' in link: platform = "Facebook"
            elif 'instagram.com' in link: platform = "Instagram"
            elif 'linkedin.com' in link: platform = "LinkedIn"
            elif 'twitter.com' in link or 'x.com' in link: platform = "Twitter/X"
            elif 'youtube.com' in link: platform = "YouTube"
            elif 'tiktok.com' in link: platform = "TikTok"
            print(f"   ‚Ä¢ {platform}: {link}")
    else:
        print("   None found")
    
    summary = extracted_data.get('summary', 'N/A')
    print(f"\nüìù Business Summary:\n   {summary}")
    
    if 'extraction_method' in extracted_data:
        print(f"\n‚öôÔ∏è Extraction Method: {extracted_data['extraction_method']}")
    
    print("\n" + "="*80)

# ============================================================================
# EXECUTION
# ============================================================================

GOOGLE_MAPS_URL = "https://maps.app.goo.gl/Tvdq57DwjeCz1w4V6"

if __name__ == "__main__":
    if not OPENAI_API_KEY or OPENAI_API_KEY == "your-api-key-here":
        print("‚ùå ERROR: Please set your OPENAI_API_KEY!")
    else:
        extracted_data, business_name, all_pages_discovered, pages_analyzed = scrape_business_data(GOOGLE_MAPS_URL)
        
        if extracted_data and business_name:
            filename = save_results(
                extracted_data, 
                business_name,
                all_pages_discovered,
                pages_analyzed
            )
            display_summary(extracted_data)
            try:
                from google.colab import files
                files.download(filename)
                print("üì• File download started!")
            except:
                print(f"üìÅ File saved locally: {filename}")
        else:
            print("‚ùå No data extracted. Check the URL and API key, then try again.")

print("\n‚úÖ Script execution complete!")

‚úÖ Configuration loaded successfully!
üöÄ STARTING INTELLIGENT BUSINESS DATA EXTRACTION
üîç Extracting website from Google Maps URL...
‚úÖ Found website: http://www.tasteoftexas.com/

üìç Main Website: http://www.tasteoftexas.com/

üï∑Ô∏è Starting website crawl from: http://www.tasteoftexas.com/
  ‚úì Discovered [1/15]: http://www.tasteoftexas.com
  ‚úì Discovered [2/15]: http://www.tasteoftexas.com/visit-us
  ‚úì Discovered [3/15]: http://www.tasteoftexas.com/compare
  ‚úì Discovered [4/15]: http://www.tasteoftexas.com/cart.php
  ‚úì Discovered [5/15]: https://www.tasteoftexas.com
  ‚úì Discovered [6/15]: https://www.tasteoftexas.com/menu
  ‚úì Discovered [7/15]: https://www.tasteoftexas.com/wine-comparison
  ‚úì Discovered [8/15]: https://www.tasteoftexas.com/visit-us
  ‚úì Discovered [9/15]: https://www.tasteoftexas.com/private-events
  ‚úì Discovered [10/15]: https://www.tasteoftexas.com/faqs
  ‚úì Discovered [11/15]: https://www.tasteoftexas.com/to-go
  ‚úì Discovered [12/15]

In [34]:
# üîç Intelligent Web Scraper: Google Maps + LLM-Powered Page Selection
# COMPLETE WORKING VERSION - All syntax errors fixed

import os
import re
import json
import time
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from collections import deque
import warnings
warnings.filterwarnings('ignore')

import openai

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# ============================================================================
# CONFIGURATION
# ============================================================================

try:
    from google.colab import userdata
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
except:
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'your-api-key-here')

openai.api_key = OPENAI_API_KEY

MAX_CRAWL_DEPTH = 2
MAX_PAGES = 30
TOP_PAGES_TO_ANALYZE = 10
REQUEST_TIMEOUT = 10
RATE_LIMIT_DELAY = 1

ua = UserAgent()
print("‚úÖ Configuration loaded successfully!")

# ============================================================================
# GOOGLE MAPS URL PROCESSOR
# ============================================================================

def setup_selenium_driver():
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument(f'user-agent={ua.random}')
    
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

def extract_website_from_google_maps(maps_url, use_selenium=True):
    print(f"üîç Extracting website from Google Maps URL...")
    
    if use_selenium:
        driver = None
        try:
            driver = setup_selenium_driver()
            driver.get(maps_url)
            time.sleep(3)
            selectors = [
                "a[data-item-id='authority']",
                "a[aria-label*='Website']",
                "a[data-tooltip='Open website']",
                "button[data-item-id='authority']"
            ]
            
            for selector in selectors:
                try:
                    elements = driver.find_elements(By.CSS_SELECTOR, selector)
                    for element in elements:
                        href = element.get_attribute('href')
                        if href and 'google.com' not in href:
                            print(f"‚úÖ Found website: {href}")
                            return href
                except:
                    continue
            
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for link in soup.find_all('a', href=True):
                href = link['href']
                if href.startswith('http') and 'google.com' not in href and 'gstatic.com' not in href:
                    if not any(x in href for x in ['/maps/', '/search?', 'youtube.com', 'facebook.com']):
                        print(f"‚úÖ Found website via fallback: {href}")
                        return href
        except Exception as e:
            print(f"‚ùå Error with Selenium: {e}")
        finally:
            if driver:
                driver.quit()
    
    try:
        headers = {'User-Agent': ua.random}
        response = requests.get(maps_url, headers=headers, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith('http') and 'google.com' not in href:
                print(f"‚úÖ Found website: {href}")
                return href
    except Exception as e:
        print(f"‚ùå Error with requests: {e}")
    
    print("‚ö†Ô∏è Could not extract website URL")
    return None

# ============================================================================
# WEBSITE CRAWLER
# ============================================================================

def is_valid_url(url, base_domain):
    try:
        parsed = urlparse(url)
        base_parsed = urlparse(base_domain)
        if parsed.netloc != base_parsed.netloc:
            return False
        skip_extensions = ['.pdf', '.jpg', '.jpeg', '.png', '.gif', '.zip', '.exe', '.mp4', '.mp3']
        if any(url.lower().endswith(ext) for ext in skip_extensions):
            return False
        skip_patterns = ['#', 'javascript:', 'mailto:', 'tel:', '/cdn-cgi/', '/wp-admin/']
        if any(pattern in url.lower() for pattern in skip_patterns):
            return False
        return True
    except:
        return False

def crawl_website(start_url, max_depth=MAX_CRAWL_DEPTH, max_pages=MAX_PAGES):
    print(f"üï∑Ô∏è Starting website crawl from: {start_url}")
    
    visited = set()
    to_visit = deque([(start_url.rstrip('/'), 0)])
    pages = []
    base_domain = f"{urlparse(start_url).scheme}://{urlparse(start_url).netloc}"
    
    while to_visit and len(pages) < max_pages:
        current_url, depth = to_visit.popleft()
        current_url = current_url.rstrip('/')
        if current_url in visited or depth > max_depth:
            continue
        visited.add(current_url)
        try:
            headers = {'User-Agent': ua.random}
            response = requests.get(current_url, headers=headers, timeout=REQUEST_TIMEOUT, allow_redirects=True)
            if response.status_code != 200:
                continue
            pages.append(current_url)
            print(f"  ‚úì Discovered [{len(pages)}/{max_pages}]: {current_url}")
            soup = BeautifulSoup(response.content, 'html.parser')
            for link in soup.find_all('a', href=True):
                absolute_url = urljoin(current_url, link['href'])
                clean_url = absolute_url.split('#')[0].split('?')[0].rstrip('/')
                if is_valid_url(clean_url, base_domain) and clean_url not in visited:
                    to_visit.append((clean_url, depth + 1))
            time.sleep(RATE_LIMIT_DELAY)
        except Exception as e:
            print(f"  ‚úó Error crawling {current_url}: {str(e)[:50]}")
    
    print(f"‚úÖ Crawl complete! Discovered {len(pages)} pages")
    return pages

# ============================================================================
# LLM-POWERED PAGE SELECTION
# ============================================================================

def select_relevant_pages_with_llm(page_urls, top_n=TOP_PAGES_TO_ANALYZE):
    print(f"\nü§ñ Using LLM to select top {top_n} most relevant pages...")
    url_list = "\n".join([f"{i+1}. {url}" for i, url in enumerate(page_urls)])
    
    system_prompt = """You are an expert web analyst. Your job is to identify the most relevant pages that contain business information like company details, contact info, services, and about information."""
    
    user_prompt = f"""Analyze these {len(page_urls)} URLs and select the top {top_n} most relevant pages for extracting company information (like contact details, about us, services, etc.).

URLs:
{url_list}

Return ONLY a valid JSON array of numbers (1-indexed positions) like: [1, 3, 5, 7, 9]
Do not include any other text or explanation."""

    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.2,
            max_tokens=500
        )
        content = response.choices[0].message.content.strip()
        print(f"üîç LLM raw response: {content[:200]}")
        
        if '```' in content:
            content = content.split('```')[1]
            if content.startswith('json'):
                content = content[4:]
            content = content.strip()
        
        selected_indices = json.loads(content)
        selected_urls = [page_urls[i-1] for i in selected_indices if 0 < i <= len(page_urls)]
        
        print(f"‚úÖ LLM selected {len(selected_urls)} pages:")
        for i, url in enumerate(selected_urls, 1):
            print(f"   {i}. {url}")
        return selected_urls[:top_n]
        
    except Exception as e:
        print(f"‚ö†Ô∏è LLM selection error: {e}")
        print("üîÑ Using heuristic fallback...")
        priority_keywords = ['home', 'about', 'contact', 'service', 'product', 'portfolio', 'team', 'company']
        scored_pages = []
        for url in page_urls:
            url_lower = url.lower()
            score = sum(2 for keyword in priority_keywords if keyword in url_lower)
            score += (100 - len(url)) / 100
            scored_pages.append((score, url))
        scored_pages.sort(reverse=True, key=lambda x: x[0])
        selected = [url for _, url in scored_pages[:top_n]]
        print(f"‚úÖ Heuristic selected {len(selected)} pages")
        return selected

# ============================================================================
# CONTENT EXTRACTOR
# ============================================================================

def extract_page_content(url):
    """Extract both text content and raw HTML for better social media detection"""
    try:
        headers = {'User-Agent': ua.random}
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            return "", ""
        
        html_content = response.text
        soup = BeautifulSoup(response.content, 'html.parser')
        
        for script in soup(['script', 'style', 'iframe', 'noscript']):
            script.decompose()
        
        text = soup.get_text(separator=' ', strip=True)
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        cleaned_text = ' '.join(chunk for chunk in chunks if chunk)
        
        return cleaned_text, html_content
    except Exception as e:
        print(f"  ‚úó Error extracting content from {url}: {e}")
        return "", ""

def extract_social_from_html(html_list):
    """Extract social media links directly from raw HTML"""
    if not html_list:
        return []
    
    all_html = " ".join(html_list)
    socials = set()
    
    patterns = [
        r'https?://(?:www\.)?facebook\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?instagram\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?linkedin\.com/(?:company|in)/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?(?:twitter|x)\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?youtube\.com/(?:c|channel|user|@)[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?tiktok\.com/@[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?pinterest\.com/[a-zA-Z0-9._/-]+',
    ]
    
    for pattern in patterns:
        found = re.findall(pattern, all_html, re.IGNORECASE)
        for link in found:
            link = link.rstrip('/')
            link = re.sub(r'["\'>].*$', '', link)
            link = re.sub(r'(\?.*|#.*)$', '', link)
            if len(link) > 20:
                socials.add(link)
    
    return list(socials)

# ============================================================================
# LLM EXTRACTION
# ============================================================================

def extract_business_data_with_llm(page_contents, main_website_url, all_html=None):
    print("\nü§ñ Using LLM to extract consolidated business data...")
    
    social_media_from_html = extract_social_from_html(all_html) if all_html else []
    print(f"üîó Found {len(social_media_from_html)} social media links in HTML")
    
    combined_content = ""
    max_chars_per_page = 8000
    
    for url, content in page_contents.items():
        truncated_content = content[:max_chars_per_page] if len(content) > max_chars_per_page else content
        combined_content += f"\n\n=== PAGE: {url} ===\n{truncated_content}\n"
    
    max_total_chars = 60000
    if len(combined_content) > max_total_chars:
        combined_content = combined_content[:max_total_chars] + "\n\n[Content truncated for length]"
    
    print(f"üìä Total content length: {len(combined_content)} characters")

    social_hint = ""
    if social_media_from_html:
        social_hint = f"\n\nNOTE: The following social media links were found in the HTML: {', '.join(social_media_from_html[:5])}"

    system_prompt = """You are an expert business data extraction assistant. Extract comprehensive business information from website content and return ONLY valid JSON with no additional text, markdown formatting, or explanations."""
    
    user_prompt = f"""Analyze the following website content and extract ALL business information for: {main_website_url}

{combined_content}{social_hint}

Extract and return ONLY a JSON object with this EXACT structure (no markdown, no text before or after):
{{
  "company_name": "Full official company name",
  "company_main_url": "{main_website_url}",
  "emails": ["email1@domain.com", "email2@domain.com"],
  "contact_numbers": ["+12345678900", "2345678900"],
  "social_media_links": ["https://facebook.com/page", "https://instagram.com/profile"],
  "summary": "A comprehensive 5-10 line summary describing: what the company does, short introduction main services/products offered, Base this on ALL analyzed pages."
}}

CRITICAL INSTRUCTIONS:
1. Find ALL emails, phone numbers, and social media links across all pages
2. Look for social media links in footer, header, contact pages, and inline content
3. Include full URLs for social media (Facebook, Instagram, Twitter/X, LinkedIn, YouTube, TikTok, Pinterest)
4. Write a detailed, informative summary that truly captures what the business does
5. Use empty arrays [] for missing data, never null
6. Return ONLY the JSON object - no explanation, no markdown backticks, no preamble"""

    try:
        print("üîÑ Calling OpenAI API...")
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.1,
            max_tokens=4000,
            response_format={"type": "json_object"}
        )
        
        content = response.choices[0].message.content.strip()
        print(f"üîç LLM response received ({len(content)} chars)")
        print(f"üìù First 500 chars of response: {content[:500]}")
        
        extracted_data = json.loads(content)
        
        if social_media_from_html:
            existing_socials = set(extracted_data.get('social_media_links', []))
            all_socials = existing_socials.union(set(social_media_from_html))
            extracted_data['social_media_links'] = list(all_socials)
        
        extracted_data = validate_and_clean_data(extracted_data, main_website_url)
        
        if (not extracted_data.get('emails') and 
            not extracted_data.get('contact_numbers') and 
            not extracted_data.get('social_media_links')):
            print("‚ö†Ô∏è LLM extraction returned empty data, trying fallback...")
            return create_fallback_data(main_website_url, page_contents, all_html)
        
        print("‚úÖ Data extraction successful via LLM!")
        return extracted_data
        
    except json.JSONDecodeError as e:
        print(f"‚ùå JSON parsing error: {e}")
        print(f"üîç Raw response: {content[:500]}")
        return create_fallback_data(main_website_url, page_contents, all_html)
    except Exception as e:
        print(f"‚ùå LLM extraction error: {type(e).__name__}: {e}")
        import traceback
        print(f"üîç Traceback: {traceback.format_exc()}")
        return create_fallback_data(main_website_url, page_contents, all_html)

def validate_and_clean_data(data, main_url):
    """Validate and clean extracted data"""
    cleaned = {
        "company_name": data.get("company_name") or urlparse(main_url).netloc.replace('www.', '').split('.')[0].title(),
        "company_main_url": main_url,
        "emails": [],
        "contact_numbers": [],
        "social_media_links": [],
        "summary": data.get("summary") or "No summary available"
    }
    
    if data.get("emails") and isinstance(data["emails"], list):
        email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
        cleaned["emails"] = [e.strip() for e in data["emails"] if re.match(email_pattern, e.strip())]
    
    if data.get("contact_numbers") and isinstance(data["contact_numbers"], list):
        cleaned["contact_numbers"] = [p.strip() for p in data["contact_numbers"] if p and len(str(p).strip()) > 5]
    
    if data.get("social_media_links") and isinstance(data["social_media_links"], list):
        social_domains = ['facebook.com', 'instagram.com', 'linkedin.com', 'twitter.com', 'x.com', 
                         'youtube.com', 'tiktok.com', 'pinterest.com']
        cleaned["social_media_links"] = [
            s.strip() for s in data["social_media_links"] 
            if s and any(domain in s.lower() for domain in social_domains)
        ]
    
    return cleaned

# ============================================================================
# FALLBACK EXTRACTION
# ============================================================================

def create_fallback_data(main_website_url, page_contents, all_html=None):
    """Enhanced fallback extraction with better regex patterns and LLM summary"""
    print("üîÑ Using enhanced fallback extraction...")
    
    all_text = " ".join(page_contents.values())
    social_from_html = extract_social_from_html(all_html) if all_html else []
    
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'
    emails = list(set(re.findall(email_pattern, all_text)))
    emails = [e for e in emails if not e.endswith(('.png', '.jpg', '.gif', '.svg'))]
    
    phone_patterns = [
        r'\+\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}',
        r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
        r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}',
    ]
    
    phones = set()
    for pattern in phone_patterns:
        found = re.findall(pattern, all_text)
        for phone in found:
            cleaned = re.sub(r'[^\d+()-]', '', phone)
            if len(re.sub(r'[^\d]', '', cleaned)) >= 10:
                phones.add(phone.strip())
    
    socials_from_text = set()
    social_patterns = [
        r'https?://(?:www\.)?facebook\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?instagram\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?linkedin\.com/(?:company|in)/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?(?:twitter|x)\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?youtube\.com/(?:c|channel|user|@)[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?tiktok\.com/@[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?pinterest\.com/[a-zA-Z0-9._/-]+',
    ]
    
    for pattern in social_patterns:
        found = re.findall(pattern, all_text, re.IGNORECASE)
        for link in found:
            link = link.rstrip('/')
            link = re.sub(r'(\?.*|#.*)$', '', link)
            socials_from_text.add(link)
    
    all_socials = set(social_from_html).union(socials_from_text)
    
    company_name = urlparse(main_website_url).netloc.replace('www.', '').split('.')[0]
    company_name = ' '.join(word.capitalize() for word in re.split(r'[-_]', company_name))
    
    summary = generate_fallback_summary(all_text, company_name, main_website_url)
    
    return {
        "company_name": company_name,
        "company_main_url": main_website_url,
        "emails": sorted(list(set(emails)))[:15],
        "contact_numbers": sorted(list(phones))[:15],
        "social_media_links": sorted(list(all_socials)),
        "summary": summary,
        "extraction_method": "enhanced_regex_fallback_with_llm_summary"
    }

def generate_fallback_summary(content, company_name, website_url):
    """Generate a summary using LLM even when structured extraction fails"""
    try:
        truncated = content[:15000] if len(content) > 15000 else content
        
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a business analyst. Write concise, informative summaries."},
                {"role": "user", "content": f"""Based on this website content for {company_name} ({website_url}), write a 5-10 line summary describing:
- What the company does
- Main services/products
- Target audience
- Unique aspects

Content:
{truncated}

Write only the summary, no preamble."""}
            ],
            temperature=0.3,
            max_tokens=500
        )
        
        summary = response.choices[0].message.content.strip()
        print("‚úÖ Generated summary using LLM")
        return summary
        
    except Exception as e:
        print(f"‚ö†Ô∏è Could not generate LLM summary: {e}")
        return "Business information extracted using automated method. Unable to generate detailed summary. Please visit the website for more information."

# ============================================================================
# MAIN ORCHESTRATION
# ============================================================================

def scrape_business_data(google_maps_url):
    print("="*80)
    print("üöÄ STARTING INTELLIGENT BUSINESS DATA EXTRACTION")
    print("="*80)
    
    website_url = extract_website_from_google_maps(google_maps_url)
    if not website_url:
        print("‚ùå Failed to extract website URL from Google Maps")
        return None, None, 0, 0

    parsed = urlparse(website_url)
    website_url = f"{parsed.scheme}://{parsed.netloc}/"
    print(f"\nüìç Main Website: {website_url}\n")
    
    all_pages = crawl_website(website_url)
    if not all_pages:
        print("‚ùå No pages found to scrape")
        return None, None, 0, 0
    
    selected_pages = select_relevant_pages_with_llm(all_pages, TOP_PAGES_TO_ANALYZE)
    if not selected_pages:
        print("‚ùå No pages selected for analysis")
        return None, None, len(all_pages), 0

    print(f"\nüì• Extracting content from {len(selected_pages)} selected pages...")
    page_contents = {}
    all_html = []
    
    for i, page_url in enumerate(selected_pages, 1):
        print(f"  [{i}/{len(selected_pages)}] Extracting: {page_url}")
        text_content, html_content = extract_page_content(page_url)
        if text_content:
            page_contents[page_url] = text_content
            all_html.append(html_content)
            print(f"      ‚úì Extracted {len(text_content)} characters")
        time.sleep(RATE_LIMIT_DELAY)
    
    if not page_contents:
        print("‚ùå No content extracted from any page")
        return None, None, len(all_pages), len(selected_pages)

    extracted_data = extract_business_data_with_llm(page_contents, website_url, all_html)
    business_name = extracted_data.get('company_name', 'unknown_business')
    if business_name:
        business_name = re.sub(r'[^\w\s-]', '', str(business_name))
        business_name = re.sub(r'[-\s]+', '_', business_name).lower()

    print("\n" + "="*80)
    print("‚úÖ EXTRACTION COMPLETE!")
    print("="*80)
    
    return extracted_data, business_name, len(all_pages), len(page_contents)

# ============================================================================
# SAVE AND DISPLAY
# ============================================================================

def save_results(extracted_data, business_name, all_pages_count, selected_pages_count):
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    filename = f"{business_name}_{timestamp}.json"
    output = {
        "business_data": extracted_data,
        "extraction_metadata": {
            "total_pages_discovered": all_pages_count,
            "pages_analyzed": selected_pages_count,
            "extraction_method": "LLM-powered intelligent page selection",
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "model_used": "gpt-4o-mini"
        }
    }
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\nüíæ Results saved to: {filename}")
    return filename

def display_summary(extracted_data):
    print("\n" + "="*80)
    print("üìä EXTRACTION SUMMARY")
    print("="*80)
    
    print(f"\nüè¢ Company Name: {extracted_data.get('company_name', 'N/A')}")
    print(f"üåê Website: {extracted_data.get('company_main_url', 'N/A')}")
    
    emails = extracted_data.get('emails', [])
    print(f"\nüìß Emails ({len(emails)} found):")
    if emails:
        for email in emails[:5]:
            print(f"   ‚Ä¢ {email}")
        if len(emails) > 5:
            print(f"   ... and {len(emails) - 5} more")
    else:
        print("   None found")
    
    phones = extracted_data.get('contact_numbers', [])
    print(f"\nüì± Phone Numbers ({len(phones)} found):")
    if phones:
        for phone in phones[:5]:
            print(f"   ‚Ä¢ {phone}")
        if len(phones) > 5:
            print(f"   ... and {len(phones) - 5} more")
    else:
        print("   None found")
    
    socials = extracted_data.get('social_media_links', [])
    print(f"\nüîó Social Media ({len(socials)} links):")
    if socials:
        for link in socials:
            platform = "Unknown"
            if 'facebook.com' in link: platform = "Facebook"
            elif 'instagram.com' in link: platform = "Instagram"
            elif 'linkedin.com' in link: platform = "LinkedIn"
            elif 'twitter.com' in link or 'x.com' in link: platform = "Twitter/X"
            elif 'youtube.com' in link: platform = "YouTube"
            elif 'tiktok.com' in link: platform = "TikTok"
            print(f"   ‚Ä¢ {platform}: {link}")
    else:
        print("   None found")
    
    summary = extracted_data.get('summary', 'N/A')
    print(f"\nüìù Business Summary:\n   {summary}")
    
    if 'extraction_method' in extracted_data:
        print(f"\n‚öôÔ∏è Extraction Method: {extracted_data['extraction_method']}")
    
    print("\n" + "="*80)


# ============================================================================
# EXECUTION
# ============================================================================

GOOGLE_MAPS_URL = "https://maps.app.goo.gl/xjfuyKPZsg8tbTP68"

if __name__ == "__main__":
    if not OPENAI_API_KEY or OPENAI_API_KEY == "your-api-key-here":
        print("‚ùå ERROR: Please set your OPENAI_API_KEY!")
    else:
        extracted_data, business_name, all_pages_discovered, pages_analyzed = scrape_business_data(GOOGLE_MAPS_URL)
        
        if extracted_data and business_name:
            filename = save_results(
                extracted_data, 
                business_name,
                all_pages_discovered,
                pages_analyzed
            )
            display_summary(extracted_data)
            try:
                from google.colab import files
                files.download(filename)
                print("üì• File download started!")
            except:
                print(f"üìÅ File saved locally: {filename}")
        else:
            print("‚ùå No data extracted. Check the URL and API key, then try again.")

print("\n‚úÖ Script execution complete!")

‚úÖ Configuration loaded successfully!
üöÄ STARTING INTELLIGENT BUSINESS DATA EXTRACTION
üîç Extracting website from Google Maps URL...
‚úÖ Found website: https://eatmila.com/?utm_source=google&utm_medium=organic&utm_campaign=gmb-listing

üìç Main Website: https://eatmila.com/

üï∑Ô∏è Starting website crawl from: https://eatmila.com/
  ‚úì Discovered [1/30]: https://eatmila.com
  ‚úì Discovered [2/30]: https://eatmila.com/products/classic-pork-xiao-long-bao
  ‚úì Discovered [3/30]: https://eatmila.com/products/potstickers
  ‚úì Discovered [4/30]: https://eatmila.com/products/chocolate-black-sesame-lava-dumplings
  ‚úì Discovered [5/30]: https://eatmila.com/products/noodles
  ‚úì Discovered [6/30]: https://eatmila.com/products/braised-beef-noodles
  ‚úì Discovered [7/30]: https://eatmila.com/collections/national
  ‚úì Discovered [8/30]: https://eatmila.com/products/mila-signature-bundle 
  ‚úì Discovered [9/30]: https://eatmila.com/products/hulu-sauce-jars
  ‚úì Discovered [10/30]: 