In [1]:
# 🔍 Intelligent Web Scraper: Google Maps + LLM-Powered Page Selection
# COMPLETE WORKING VERSION - All syntax errors fixed

import os
import re
import json
import time
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from collections import deque
import warnings
warnings.filterwarnings('ignore')

import openai

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# ============================================================================
# CONFIGURATION
# ============================================================================

# Resolve the OpenAI API key: try the Colab secrets store first, otherwise
# read the OPENAI_API_KEY environment variable, falling back to a placeholder
# string when it is unset.
try:
    from google.colab import userdata
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
except Exception:
    # ``except Exception`` (rather than a bare ``except:``) still covers the
    # ImportError raised outside Colab and any lookup failure, but no longer
    # swallows KeyboardInterrupt / SystemExit.
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'your-api-key-here')

openai.api_key = OPENAI_API_KEY

MAX_CRAWL_DEPTH = 2        # default ``max_depth`` for crawl_website
MAX_PAGES = 30             # default ``max_pages`` for crawl_website
TOP_PAGES_TO_ANALYZE = 10  # default number of pages selected for extraction
REQUEST_TIMEOUT = 10       # passed as ``timeout`` to requests.get
RATE_LIMIT_DELAY = 1       # passed to time.sleep between page requests

# Shared generator of random User-Agent strings (``ua.random``).
ua = UserAgent()
print("‚úÖ Configuration loaded successfully!")

# ============================================================================
# GOOGLE MAPS URL PROCESSOR
# ============================================================================

def setup_selenium_driver():
    """Create and return a headless Chrome WebDriver.

    The browser is started with sandboxing and /dev/shm usage disabled and a
    random user agent taken from the module-level ``ua``. The chromedriver
    binary path comes from ``ChromeDriverManager().install()``.
    """
    browser_flags = ('--headless', '--no-sandbox', '--disable-dev-shm-usage')

    opts = Options()
    for flag in browser_flags:
        opts.add_argument(flag)
    opts.add_argument(f'user-agent={ua.random}')

    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=opts,
    )

def _is_external_website_link(href):
    """Return True if *href* is an absolute URL that is not a Google, gstatic, YouTube or Facebook link."""
    if not href.startswith('http'):
        return False
    if 'google.com' in href or 'gstatic.com' in href:
        return False
    return not any(x in href for x in ('/maps/', '/search?', 'youtube.com', 'facebook.com'))


def extract_website_from_google_maps(maps_url, use_selenium=True):
    """Find the business website linked from a Google Maps place page.

    Strategy, in order:
      1. (if ``use_selenium``) load the page in headless Chrome and look for
         the website link through a list of known CSS selectors;
      2. scan every anchor of the rendered page for an external link;
      3. fetch the page with ``requests`` and scan its anchors the same way.

    Args:
        maps_url: URL of the Google Maps page.
        use_selenium: When False, skip straight to the ``requests`` strategy.

    Returns:
        The first matching href as a string, or None when nothing was found.
        Errors in either strategy are printed, not raised.
    """
    print("üîç Extracting website from Google Maps URL...")

    if use_selenium:
        driver = None
        try:
            driver = setup_selenium_driver()
            driver.get(maps_url)
            # Fixed pause after navigation, presumably to let the page render.
            time.sleep(3)
            selectors = [
                "a[data-item-id='authority']",
                "a[aria-label*='Website']",
                "a[data-tooltip='Open website']",
                "button[data-item-id='authority']"
            ]

            for selector in selectors:
                try:
                    elements = driver.find_elements(By.CSS_SELECTOR, selector)
                    for element in elements:
                        href = element.get_attribute('href')
                        if href and 'google.com' not in href:
                            print(f"‚úÖ Found website: {href}")
                            return href
                except Exception:
                    # Best effort: a failing selector must not stop the others,
                    # but Ctrl-C / SystemExit should still propagate.
                    continue

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for link in soup.find_all('a', href=True):
                href = link['href']
                if _is_external_website_link(href):
                    print(f"‚úÖ Found website via fallback: {href}")
                    return href
        except Exception as e:
            print(f"‚ùå Error with Selenium: {e}")
        finally:
            # Always release the browser, including on the early returns above.
            if driver:
                driver.quit()

    try:
        headers = {'User-Agent': ua.random}
        response = requests.get(maps_url, headers=headers, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            href = link['href']
            # Same exclusions as the Selenium fallback, so this path cannot
            # return a gstatic / YouTube / Facebook link as "the website".
            if _is_external_website_link(href):
                print(f"‚úÖ Found website: {href}")
                return href
    except Exception as e:
        print(f"‚ùå Error with requests: {e}")

    print("‚ö†Ô∏è Could not extract website URL")
    return None

# ============================================================================
# WEBSITE CRAWLER
# ============================================================================

def is_valid_url(url, base_domain):
    """Decide whether *url* is a crawlable page on the same host as *base_domain*.

    A URL is rejected when:
      * its network location differs from that of ``base_domain``;
      * its path ends in a known binary/media extension (checked on the path,
        so a trailing query string such as ``file.pdf?dl=1`` is still caught);
      * it contains a fragment marker, a non-HTTP scheme prefix
        (``javascript:``, ``mailto:``, ``tel:``) or an admin/CDN path.

    Args:
        url: Absolute URL to test.
        base_domain: URL whose netloc defines the allowed host.

    Returns:
        True if the URL should be crawled, False otherwise (including when the
        URL cannot be parsed).
    """
    skip_extensions = ('.pdf', '.jpg', '.jpeg', '.png', '.gif', '.zip', '.exe', '.mp4', '.mp3')
    skip_patterns = ('#', 'javascript:', 'mailto:', 'tel:', '/cdn-cgi/', '/wp-admin/')
    try:
        parsed = urlparse(url)
        if parsed.netloc != urlparse(base_domain).netloc:
            return False
        if parsed.path.lower().endswith(skip_extensions):
            return False
        lowered = url.lower()
        return not any(pattern in lowered for pattern in skip_patterns)
    except (ValueError, AttributeError, TypeError):
        # urlparse raises ValueError on malformed netlocs; the other two cover
        # non-string input. Anything unparseable is simply not crawlable.
        return False

def crawl_website(start_url, max_depth=MAX_CRAWL_DEPTH, max_pages=MAX_PAGES):
    """Breadth-first crawl of same-host pages starting at *start_url*.

    Each URL is normalised by dropping its fragment, query string and trailing
    slash. Only pages that answer with HTTP 200 are recorded.

    Args:
        start_url: URL to begin crawling from.
        max_depth: Maximum link distance from ``start_url`` to follow.
        max_pages: Stop once this many pages have been recorded.

    Returns:
        List of discovered page URLs in the order they were fetched.
        Per-page errors are printed and do not abort the crawl.
    """
    print(f"üï∑Ô∏è Starting website crawl from: {start_url}")

    root = start_url.rstrip('/')
    origin = urlparse(start_url)
    base_domain = f"{origin.scheme}://{origin.netloc}"

    visited = set()
    # URLs that have ever been put on the queue. Checking this at enqueue time
    # keeps the queue from filling with the same navigation links repeated on
    # every page; BFS order (and therefore the result) is unchanged because
    # the first enqueue of a URL always carries its smallest depth.
    queued = {root}
    to_visit = deque([(root, 0)])
    pages = []

    while to_visit and len(pages) < max_pages:
        current_url, depth = to_visit.popleft()
        if current_url in visited or depth > max_depth:
            continue
        visited.add(current_url)
        try:
            headers = {'User-Agent': ua.random}
            response = requests.get(current_url, headers=headers, timeout=REQUEST_TIMEOUT, allow_redirects=True)
            if response.status_code != 200:
                continue
            pages.append(current_url)
            print(f"  ‚úì Discovered [{len(pages)}/{max_pages}]: {current_url}")
            if depth >= max_depth:
                # Children would exceed max_depth and be discarded anyway.
                continue
            soup = BeautifulSoup(response.content, 'html.parser')
            for link in soup.find_all('a', href=True):
                absolute_url = urljoin(current_url, link['href'])
                clean_url = absolute_url.split('#')[0].split('?')[0].rstrip('/')
                if clean_url in queued or not is_valid_url(clean_url, base_domain):
                    continue
                queued.add(clean_url)
                to_visit.append((clean_url, depth + 1))
        except Exception as e:
            print(f"  ‚úó Error crawling {current_url}: {str(e)[:50]}")
        finally:
            # Pause after every request, including failures and non-200
            # responses, so an erroring site is not hit back-to-back.
            time.sleep(RATE_LIMIT_DELAY)

    print(f"‚úÖ Crawl complete! Discovered {len(pages)} pages")
    return pages

# ============================================================================
# LLM-POWERED PAGE SELECTION
# ============================================================================

def _rank_pages_by_keywords(page_urls, top_n):
    """Heuristic ranking: +2 per business keyword in the URL, plus a small bonus for shorter URLs."""
    priority_keywords = ['home', 'about', 'contact', 'service', 'product', 'portfolio', 'team', 'company']
    scored_pages = []
    for url in page_urls:
        url_lower = url.lower()
        score = sum(2 for keyword in priority_keywords if keyword in url_lower)
        score += (100 - len(url)) / 100
        scored_pages.append((score, url))
    scored_pages.sort(reverse=True, key=lambda x: x[0])
    return [url for _, url in scored_pages[:top_n]]


def select_relevant_pages_with_llm(page_urls, top_n=TOP_PAGES_TO_ANALYZE):
    """Pick the pages most likely to hold company information.

    The URL list is sent to the chat model, which is asked for a JSON array of
    1-indexed positions. If the call fails, the reply cannot be parsed, or it
    contains no usable index, a keyword-based heuristic ranking is used
    instead.

    Args:
        page_urls: Candidate page URLs.
        top_n: Maximum number of URLs to return.

    Returns:
        List of at most ``top_n`` URLs taken from ``page_urls``; empty only
        when ``page_urls`` is empty.
    """
    print(f"\nü§ñ Using LLM to select top {top_n} most relevant pages...")
    if not page_urls:
        # Nothing to choose from; avoid a pointless API call.
        return []

    url_list = "\n".join([f"{i+1}. {url}" for i, url in enumerate(page_urls)])

    system_prompt = """You are an expert web analyst. Your job is to identify the most relevant pages that contain business information like company details, contact info, services, and about information."""

    user_prompt = f"""Analyze these {len(page_urls)} URLs and select the top {top_n} most relevant pages for extracting company information (like contact details, about us, services, etc.).

URLs:
{url_list}

Return ONLY a valid JSON array of numbers (1-indexed positions) like: [1, 3, 5, 7, 9]
Do not include any other text or explanation."""

    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.2,
            max_tokens=500
        )
        content = response.choices[0].message.content.strip()
        print(f"üîç LLM raw response: {content[:200]}")

        # Tolerate a reply wrapped in a markdown code fence.
        if '```' in content:
            content = content.split('```')[1]
            if content.startswith('json'):
                content = content[4:]
            content = content.strip()

        selected_indices = json.loads(content)
        if not isinstance(selected_indices, list):
            raise ValueError("LLM response is not a JSON array")

        # Keep valid, in-range integer positions only, first occurrence wins.
        selected_urls = []
        seen = set()
        for idx in selected_indices:
            if isinstance(idx, bool) or not isinstance(idx, int):
                continue
            if 0 < idx <= len(page_urls) and idx not in seen:
                seen.add(idx)
                selected_urls.append(page_urls[idx - 1])
        selected_urls = selected_urls[:top_n]

        if not selected_urls:
            # Route an empty selection to the heuristic instead of returning
            # nothing and aborting the whole pipeline.
            raise ValueError("LLM returned no usable page indices")

        print(f"‚úÖ LLM selected {len(selected_urls)} pages:")
        for i, url in enumerate(selected_urls, 1):
            print(f"   {i}. {url}")
        return selected_urls

    except Exception as e:
        print(f"‚ö†Ô∏è LLM selection error: {e}")
        print("üîÑ Using heuristic fallback...")
        selected = _rank_pages_by_keywords(page_urls, top_n)
        print(f"‚úÖ Heuristic selected {len(selected)} pages")
        return selected

# ============================================================================
# CONTENT EXTRACTOR
# ============================================================================

def extract_page_content(url):
    """Download *url* and return ``(visible_text, raw_html)``.

    Script, style, iframe and noscript elements are removed before the text is
    extracted, and whitespace is collapsed. Any non-200 response or exception
    yields ``("", "")``; exceptions are printed rather than raised.
    """
    try:
        response = requests.get(
            url,
            headers={'User-Agent': ua.random},
            timeout=REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            return "", ""

        raw_html = response.text
        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop elements whose contents are not visible page text.
        for tag in soup(['script', 'style', 'iframe', 'noscript']):
            tag.decompose()

        visible = soup.get_text(separator=' ', strip=True)

        fragments = []
        for raw_line in visible.splitlines():
            for phrase in raw_line.strip().split("  "):
                phrase = phrase.strip()
                if phrase:
                    fragments.append(phrase)

        return ' '.join(fragments), raw_html
    except Exception as e:
        print(f"  ‚úó Error extracting content from {url}: {e}")
        return "", ""

def extract_social_from_html(html_list):
    """Collect social-media profile URLs found in raw HTML documents.

    Args:
        html_list: Iterable of HTML strings (may be empty or None).

    Returns:
        Sorted list of unique URLs for Facebook, Instagram, LinkedIn,
        Twitter/X, YouTube, TikTok and Pinterest. Trailing slashes are removed
        and matches of 20 characters or fewer are dropped as too short to be
        a profile link.
    """
    if not html_list:
        return []

    all_html = " ".join(html_list)

    # The path character class excludes quotes, '>', '?' and '#', so a match
    # already stops at the end of the URL path and needs no further trimming.
    patterns = [
        r'https?://(?:www\.)?facebook\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?instagram\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?linkedin\.com/(?:company|in)/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?(?:twitter|x)\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?youtube\.com/(?:c|channel|user|@)[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?tiktok\.com/@[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?pinterest\.com/[a-zA-Z0-9._/-]+',
    ]

    socials = set()
    for pattern in patterns:
        for link in re.findall(pattern, all_html, re.IGNORECASE):
            link = link.rstrip('/')
            if len(link) > 20:
                socials.add(link)

    # Sorted so the result is deterministic across runs.
    return sorted(socials)

# ============================================================================
# LLM EXTRACTION
# ============================================================================

def extract_business_data_with_llm(page_contents, main_website_url, all_html=None):
    """Extract consolidated business data from page text with the chat model.

    Page texts are truncated per page and in total, sent to the model together
    with any social links found in the raw HTML, and the JSON reply is
    validated. The regex-based fallback is used when the call fails, the reply
    is not valid JSON, or the validated result has no emails, phone numbers or
    social links.

    Args:
        page_contents: Mapping of page URL -> extracted text.
        main_website_url: Canonical website URL of the business.
        all_html: Optional list of raw HTML strings used for social-link
            detection.

    Returns:
        Dict of business data as produced by ``validate_and_clean_data`` or
        ``create_fallback_data``.
    """
    print("\nü§ñ Using LLM to extract consolidated business data...")

    social_media_from_html = extract_social_from_html(all_html) if all_html else []
    print(f"üîó Found {len(social_media_from_html)} social media links in HTML")

    max_chars_per_page = 8000
    combined_content = "".join(
        f"\n\n=== PAGE: {url} ===\n{page_text[:max_chars_per_page]}\n"
        for url, page_text in page_contents.items()
    )

    max_total_chars = 60000
    if len(combined_content) > max_total_chars:
        combined_content = combined_content[:max_total_chars] + "\n\n[Content truncated for length]"

    print(f"üìä Total content length: {len(combined_content)} characters")

    social_hint = ""
    if social_media_from_html:
        social_hint = f"\n\nNOTE: The following social media links were found in the HTML: {', '.join(social_media_from_html[:5])}"

    system_prompt = """You are an expert business data extraction assistant. Extract comprehensive business information from website content and return ONLY valid JSON with no additional text, markdown formatting, or explanations."""

    user_prompt = f"""Analyze the following website content and extract ALL business information for: {main_website_url}

{combined_content}{social_hint}

Extract and return ONLY a JSON object with this EXACT structure (no markdown, no text before or after):
{{
  "company_name": "Full official company name",
  "company_main_url": "{main_website_url}",
  "emails": ["email1@domain.com", "email2@domain.com"],
  "contact_numbers": ["+12345678900", "2345678900"],
  "social_media_links": ["https://facebook.com/page", "https://instagram.com/profile"],
  "summary": "A comprehensive 5-10 line summary describing: what the company does, short introduction main services/products offered, Base this on ALL analyzed pages."
}}

CRITICAL INSTRUCTIONS:
1. Find ALL emails, phone numbers, and social media links across all pages
2. Look for social media links in footer, header, contact pages, and inline content
3. Include full URLs for social media (Facebook, Instagram, Twitter/X, LinkedIn, YouTube, TikTok, Pinterest)
4. Write a detailed, informative summary that truly captures what the business does
5. Use empty arrays [] for missing data, never null
6. Return ONLY the JSON object - no explanation, no markdown backticks, no preamble"""

    try:
        print("üîÑ Calling OpenAI API...")
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.1,
            max_tokens=4000,
            response_format={"type": "json_object"}
        )

        content = response.choices[0].message.content.strip()
        print(f"üîç LLM response received ({len(content)} chars)")
        print(f"üìù First 500 chars of response: {content[:500]}")

        extracted_data = json.loads(content)
        if not isinstance(extracted_data, dict):
            raise ValueError("LLM response is not a JSON object")

        if social_media_from_html:
            # The model may return null or a non-list here despite the prompt;
            # treat that as "no links" rather than losing the whole response.
            llm_socials = extracted_data.get('social_media_links')
            if not isinstance(llm_socials, list):
                llm_socials = []
            merged = {s for s in llm_socials if isinstance(s, str)}
            merged.update(social_media_from_html)
            extracted_data['social_media_links'] = sorted(merged)

        extracted_data = validate_and_clean_data(extracted_data, main_website_url)

        if (not extracted_data.get('emails') and
            not extracted_data.get('contact_numbers') and
            not extracted_data.get('social_media_links')):
            print("‚ö†Ô∏è LLM extraction returned empty data, trying fallback...")
            return create_fallback_data(main_website_url, page_contents, all_html)

        print("‚úÖ Data extraction successful via LLM!")
        return extracted_data

    except json.JSONDecodeError as e:
        print(f"‚ùå JSON parsing error: {e}")
        print(f"üîç Raw response: {content[:500]}")
        return create_fallback_data(main_website_url, page_contents, all_html)
    except Exception as e:
        print(f"‚ùå LLM extraction error: {type(e).__name__}: {e}")
        import traceback
        print(f"üîç Traceback: {traceback.format_exc()}")
        return create_fallback_data(main_website_url, page_contents, all_html)

def validate_and_clean_data(data, main_url):
    """Normalise an extracted business-data dict into the canonical shape.

    Args:
        data: Dict parsed from the model reply; list fields may be missing,
            null, or contain non-string items.
        main_url: Website URL; always used for ``company_main_url`` and, when
            no company name was extracted, to derive one from the host name.

    Returns:
        Dict with keys ``company_name``, ``company_main_url``, ``emails``,
        ``contact_numbers``, ``social_media_links`` and ``summary``. List
        fields contain stripped, de-duplicated strings in their original
        order; emails must match a basic address pattern, phone numbers must
        be longer than 5 characters and social links must be hosted on a
        known social-media domain.
    """
    email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    social_domains = ('facebook.com', 'instagram.com', 'linkedin.com', 'twitter.com', 'x.com',
                      'youtube.com', 'tiktok.com', 'pinterest.com')

    def _clean_strings(value):
        """Return stripped, unique string forms of the truthy scalar items of a list."""
        if not isinstance(value, list):
            return []
        cleaned_items = []
        for item in value:
            if not item or isinstance(item, (dict, list)):
                continue
            # str() so numeric phone numbers from the model do not crash .strip().
            text = str(item).strip()
            if text and text not in cleaned_items:
                cleaned_items.append(text)
        return cleaned_items

    def _is_social_link(link):
        """True when the link's host is a social domain or a subdomain of one."""
        candidate = link if '://' in link else f'//{link}'
        try:
            host = urlparse(candidate).hostname or ''
        except ValueError:
            return False
        # Host comparison avoids substring false positives such as
        # 'x.com' matching 'netflix.com'.
        return any(host == domain or host.endswith('.' + domain) for domain in social_domains)

    return {
        "company_name": data.get("company_name") or urlparse(main_url).netloc.replace('www.', '').split('.')[0].title(),
        "company_main_url": main_url,
        "emails": [e for e in _clean_strings(data.get("emails")) if re.match(email_pattern, e)],
        "contact_numbers": [p for p in _clean_strings(data.get("contact_numbers")) if len(p) > 5],
        "social_media_links": [s for s in _clean_strings(data.get("social_media_links")) if _is_social_link(s)],
        "summary": data.get("summary") or "No summary available",
    }

# ============================================================================
# FALLBACK EXTRACTION
# ============================================================================

def create_fallback_data(main_website_url, page_contents, all_html=None):
    """Build business data with regular expressions when LLM extraction fails.

    Emails, phone numbers and social links are pulled from the combined page
    text (plus social links from raw HTML when provided). The company name is
    derived from the website host name, and the summary comes from
    ``generate_fallback_summary``.

    Args:
        main_website_url: Canonical website URL of the business.
        page_contents: Mapping of page URL -> extracted text.
        all_html: Optional list of raw HTML strings.

    Returns:
        Dict with the standard business-data keys plus ``extraction_method``.
        Emails and phone numbers are sorted and capped at 15 entries each.
    """
    print("üîÑ Using enhanced fallback extraction...")

    all_text = " ".join(page_contents.values())
    social_from_html = extract_social_from_html(all_html) if all_html else []

    # TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a literal '|'.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b'
    image_suffixes = ('.png', '.jpg', '.gif', '.svg')
    # Asset names such as "logo@2x.png" look like emails; drop them
    # regardless of the suffix's letter case.
    emails = {
        e for e in re.findall(email_pattern, all_text)
        if not e.lower().endswith(image_suffixes)
    }

    phone_patterns = [
        r'\+\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}',
        r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
        r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}',
    ]

    phones = set()
    for pattern in phone_patterns:
        for phone in re.findall(pattern, all_text):
            # Require at least 10 digits to filter out dates, prices, etc.
            if len(re.sub(r'[^\d]', '', phone)) >= 10:
                phones.add(phone.strip())

    socials_from_text = set()
    social_patterns = [
        r'https?://(?:www\.)?facebook\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?instagram\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?linkedin\.com/(?:company|in)/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?(?:twitter|x)\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?youtube\.com/(?:c|channel|user|@)[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?tiktok\.com/@[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?pinterest\.com/[a-zA-Z0-9._/-]+',
    ]

    for pattern in social_patterns:
        for link in re.findall(pattern, all_text, re.IGNORECASE):
            link = link.rstrip('/')
            link = re.sub(r'(\?.*|#.*)$', '', link)
            socials_from_text.add(link)

    all_socials = set(social_from_html).union(socials_from_text)

    # "www.acme-foods.com" -> "Acme Foods"
    company_name = urlparse(main_website_url).netloc.replace('www.', '').split('.')[0]
    company_name = ' '.join(word.capitalize() for word in re.split(r'[-_]', company_name))

    summary = generate_fallback_summary(all_text, company_name, main_website_url)

    return {
        "company_name": company_name,
        "company_main_url": main_website_url,
        "emails": sorted(emails)[:15],
        "contact_numbers": sorted(phones)[:15],
        "social_media_links": sorted(all_socials),
        "summary": summary,
        "extraction_method": "enhanced_regex_fallback_with_llm_summary"
    }

def generate_fallback_summary(content, company_name, website_url):
    """Ask the chat model for a short business summary of *content*.

    Only the first 15000 characters of ``content`` are sent. If the request
    fails for any reason, the error is printed and a fixed generic sentence is
    returned instead.
    """
    generic_summary = "Business information extracted using automated method. Unable to generate detailed summary. Please visit the website for more information."
    try:
        truncated = content[:15000]

        request_text = f"""Based on this website content for {company_name} ({website_url}), write a 5-10 line summary describing:
- What the company does
- Main services/products
- Target audience
- Unique aspects

Content:
{truncated}

Write only the summary, no preamble."""

        conversation = [
            {"role": "system", "content": "You are a business analyst. Write concise, informative summaries."},
            {"role": "user", "content": request_text},
        ]
        reply = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=conversation,
            temperature=0.3,
            max_tokens=500
        )

        result = reply.choices[0].message.content.strip()
        print("‚úÖ Generated summary using LLM")
        return result

    except Exception as e:
        print(f"‚ö†Ô∏è Could not generate LLM summary: {e}")
        return generic_summary

# ============================================================================
# MAIN ORCHESTRATION
# ============================================================================

def scrape_business_data(google_maps_url):
    """Run the full pipeline: Maps URL -> website -> crawl -> select -> extract.

    Args:
        google_maps_url: Google Maps URL of the business.

    Returns:
        Tuple ``(extracted_data, business_name, pages_discovered,
        pages_analyzed)``. ``extracted_data`` and ``business_name`` are None
        when the pipeline stops early (no website, no pages, no selection or
        no content). ``business_name`` is a lower-case, underscore-separated
        slug of the company name.
    """
    print("="*80)
    print("üöÄ STARTING INTELLIGENT BUSINESS DATA EXTRACTION")
    print("="*80)

    website_url = extract_website_from_google_maps(google_maps_url)
    if not website_url:
        print("‚ùå Failed to extract website URL from Google Maps")
        return None, None, 0, 0

    # Reduce the link to scheme + host (drops tracking parameters and paths).
    parsed = urlparse(website_url)
    website_url = f"{parsed.scheme}://{parsed.netloc}/"
    print(f"\nüìç Main Website: {website_url}\n")

    all_pages = crawl_website(website_url)
    if not all_pages:
        print("‚ùå No pages found to scrape")
        return None, None, 0, 0

    selected_pages = select_relevant_pages_with_llm(all_pages, TOP_PAGES_TO_ANALYZE)
    if not selected_pages:
        print("‚ùå No pages selected for analysis")
        return None, None, len(all_pages), 0

    print(f"\nüì• Extracting content from {len(selected_pages)} selected pages...")
    page_contents = {}
    all_html = []

    for i, page_url in enumerate(selected_pages, 1):
        print(f"  [{i}/{len(selected_pages)}] Extracting: {page_url}")
        text_content, html_content = extract_page_content(page_url)
        if text_content:
            page_contents[page_url] = text_content
            all_html.append(html_content)
            print(f"      ‚úì Extracted {len(text_content)} characters")
        time.sleep(RATE_LIMIT_DELAY)

    if not page_contents:
        print("‚ùå No content extracted from any page")
        return None, None, len(all_pages), len(selected_pages)

    extracted_data = extract_business_data_with_llm(page_contents, website_url, all_html)

    # Build a filesystem-friendly slug. Fall back to a fixed name when the
    # company name is missing or sanitises to nothing, so that a successful
    # extraction is never discarded by callers testing ``business_name``.
    raw_name = extracted_data.get('company_name') or 'unknown_business'
    business_name = re.sub(r'[^\w\s-]', '', str(raw_name))
    business_name = re.sub(r'[-\s]+', '_', business_name).lower() or 'unknown_business'

    print("\n" + "="*80)
    print("‚úÖ EXTRACTION COMPLETE!")
    print("="*80)

    return extracted_data, business_name, len(all_pages), len(page_contents)

# ============================================================================
# SAVE AND DISPLAY
# ============================================================================

def save_results(extracted_data, business_name, all_pages_count, selected_pages_count):
    """Write the extraction result and run metadata to a timestamped JSON file.

    The file is created in the current working directory and named
    ``<business_name>_<YYYYmmdd_HHMMSS>.json``.

    Returns:
        The name of the file that was written.
    """
    filename = "{}_{}.json".format(business_name, time.strftime("%Y%m%d_%H%M%S"))

    metadata = {
        "total_pages_discovered": all_pages_count,
        "pages_analyzed": selected_pages_count,
        "extraction_method": "LLM-powered intelligent page selection",
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "model_used": "gpt-4o-mini",
    }
    payload = {"business_data": extracted_data, "extraction_metadata": metadata}

    with open(filename, 'w', encoding='utf-8') as handle:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(payload, handle, indent=2, ensure_ascii=False)

    print(f"\nüíæ Results saved to: {filename}")
    return filename

def _print_capped_items(items, limit=5):
    """Print up to *limit* bullet lines, then how many were omitted; 'None found' if empty."""
    if not items:
        print("   None found")
        return
    for item in items[:limit]:
        print(f"   ‚Ä¢ {item}")
    if len(items) > limit:
        print(f"   ... and {len(items) - limit} more")


def display_summary(extracted_data):
    """Print a human-readable report of the extracted business data.

    Shows company name, website, up to five emails and phone numbers, every
    social link labelled with its platform, the summary text and, when
    present, the extraction method.

    Args:
        extracted_data: Business-data dict; missing or null fields are shown
            as ``N/A`` / ``None found``.
    """
    # Checked in order; first match wins. Pinterest is included because it is
    # one of the domains accepted as a social link elsewhere in this file.
    platform_markers = (
        ("Facebook", ('facebook.com',)),
        ("Instagram", ('instagram.com',)),
        ("LinkedIn", ('linkedin.com',)),
        ("Twitter/X", ('twitter.com', 'x.com')),
        ("YouTube", ('youtube.com',)),
        ("TikTok", ('tiktok.com',)),
        ("Pinterest", ('pinterest.com',)),
    )

    print("\n" + "="*80)
    print("üìä EXTRACTION SUMMARY")
    print("="*80)

    print(f"\nüè¢ Company Name: {extracted_data.get('company_name', 'N/A')}")
    print(f"üåê Website: {extracted_data.get('company_main_url', 'N/A')}")

    emails = extracted_data.get('emails') or []
    print(f"\nüìß Emails ({len(emails)} found):")
    _print_capped_items(emails)

    phones = extracted_data.get('contact_numbers') or []
    print(f"\nüì± Phone Numbers ({len(phones)} found):")
    _print_capped_items(phones)

    socials = extracted_data.get('social_media_links') or []
    print(f"\nüîó Social Media ({len(socials)} links):")
    if socials:
        for link in socials:
            platform = next(
                (label for label, markers in platform_markers
                 if any(marker in link for marker in markers)),
                "Unknown",
            )
            print(f"   ‚Ä¢ {platform}: {link}")
    else:
        print("   None found")

    summary = extracted_data.get('summary', 'N/A')
    print(f"\nüìù Business Summary:\n   {summary}")

    if 'extraction_method' in extracted_data:
        print(f"\n‚öôÔ∏è Extraction Method: {extracted_data['extraction_method']}")

    print("\n" + "="*80)


# ============================================================================
# EXECUTION
# ============================================================================

# Google Maps link of the business to scrape when the file is run as a script.
GOOGLE_MAPS_URL = "https://maps.app.goo.gl/xjfuyKPZsg8tbTP68"

if __name__ == "__main__":
    # Refuse to run with a missing or placeholder API key.
    if not OPENAI_API_KEY or OPENAI_API_KEY == "your-api-key-here":
        print("‚ùå ERROR: Please set your OPENAI_API_KEY!")
    else:
        extracted_data, business_name, all_pages_discovered, pages_analyzed = scrape_business_data(GOOGLE_MAPS_URL)

        if extracted_data and business_name:
            filename = save_results(
                extracted_data,
                business_name,
                all_pages_discovered,
                pages_analyzed
            )
            display_summary(extracted_data)
            try:
                # In Colab, offer the result file as a browser download.
                from google.colab import files
                files.download(filename)
                print("üì• File download started!")
            except Exception:
                # Not in Colab (ImportError) or the download failed; the file
                # is still on disk. ``except Exception`` lets Ctrl-C through.
                print(f"üìÅ File saved locally: {filename}")
        else:
            print("‚ùå No data extracted. Check the URL and API key, then try again.")

print("\n‚úÖ Script execution complete!")

‚úÖ Configuration loaded successfully!
üöÄ STARTING INTELLIGENT BUSINESS DATA EXTRACTION
üîç Extracting website from Google Maps URL...
‚úÖ Found website: https://eatmila.com/?utm_source=google&utm_medium=organic&utm_campaign=gmb-listing

üìç Main Website: https://eatmila.com/

üï∑Ô∏è Starting website crawl from: https://eatmila.com/
  ‚úì Discovered [1/30]: https://eatmila.com
  ‚úì Discovered [2/30]: https://eatmila.com/products/classic-pork-xiao-long-bao
  ‚úì Discovered [3/30]: https://eatmila.com/products/potstickers
  ‚úì Discovered [4/30]: https://eatmila.com/products/chocolate-black-sesame-lava-dumplings
  ‚úì Discovered [5/30]: https://eatmila.com/products/noodles
  ‚úì Discovered [6/30]: https://eatmila.com/products/braised-beef-noodles
  ‚úì Discovered [7/30]: https://eatmila.com/collections/national
  ‚úì Discovered [8/30]: https://eatmila.com/products/mila-signature-bundle 
  ‚úì Discovered [9/30]: https://eatmila.com/products/hulu-sauce-jars
  ‚úì Discovered [10/30]: 

In [9]:
# 🔍 Enhanced Intelligent Web Scraper - Fixed Version
# Improved LLM extraction + Better fallback handling

import os
import re
import json
import time
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from collections import deque
import warnings
warnings.filterwarnings('ignore')

import openai

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# ============================================================================
# CONFIGURATION
# ============================================================================

# Resolve the OpenAI API key: try the Colab secrets store first, otherwise
# read the OPENAI_API_KEY environment variable, falling back to a placeholder
# string when it is unset.
try:
    from google.colab import userdata
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
except Exception:
    # ``except Exception`` (rather than a bare ``except:``) still covers the
    # ImportError raised outside Colab and any lookup failure, but no longer
    # swallows KeyboardInterrupt / SystemExit.
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'your-api-key-here')

openai.api_key = OPENAI_API_KEY

MAX_CRAWL_DEPTH = 2        # default ``max_depth`` for crawl_website
MAX_PAGES = 30             # default ``max_pages`` for crawl_website
TOP_PAGES_TO_ANALYZE = 10  # NOTE(review): presumably the number of pages selected for analysis; not used in the visible part of this cell
REQUEST_TIMEOUT = 10       # passed as ``timeout`` to requests.get
RATE_LIMIT_DELAY = 1       # passed to time.sleep between page requests

# Shared generator of random User-Agent strings (``ua.random``).
ua = UserAgent()
print("‚úÖ Configuration loaded successfully!")

# ============================================================================
# GOOGLE MAPS URL PROCESSOR
# ============================================================================

def setup_selenium_driver():
    """Create and return a headless Chrome WebDriver.

    The browser is started with sandboxing and /dev/shm usage disabled and a
    random user agent taken from the module-level ``ua``. The chromedriver
    binary path comes from ``ChromeDriverManager().install()``.
    """
    browser_flags = ('--headless', '--no-sandbox', '--disable-dev-shm-usage')

    opts = Options()
    for flag in browser_flags:
        opts.add_argument(flag)
    opts.add_argument(f'user-agent={ua.random}')

    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=opts,
    )

def _is_external_website_link(href):
    """Return True if *href* is an absolute URL that is not a Google, gstatic, YouTube or Facebook link."""
    if not href.startswith('http'):
        return False
    if 'google.com' in href or 'gstatic.com' in href:
        return False
    return not any(x in href for x in ('/maps/', '/search?', 'youtube.com', 'facebook.com'))


def extract_website_from_google_maps(maps_url, use_selenium=True):
    """Find the business website linked from a Google Maps place page.

    Strategy, in order:
      1. (if ``use_selenium``) load the page in headless Chrome and look for
         the website link through a list of known CSS selectors;
      2. scan every anchor of the rendered page for an external link;
      3. fetch the page with ``requests`` and scan its anchors the same way.

    Args:
        maps_url: URL of the Google Maps page.
        use_selenium: When False, skip straight to the ``requests`` strategy.

    Returns:
        The first matching href as a string, or None when nothing was found.
        Errors in either strategy are printed, not raised.
    """
    print("üîç Extracting website from Google Maps URL...")

    if use_selenium:
        driver = None
        try:
            driver = setup_selenium_driver()
            driver.get(maps_url)
            # Fixed pause after navigation, presumably to let the page render.
            time.sleep(3)
            selectors = [
                "a[data-item-id='authority']",
                "a[aria-label*='Website']",
                "a[data-tooltip='Open website']",
                "button[data-item-id='authority']"
            ]

            for selector in selectors:
                try:
                    elements = driver.find_elements(By.CSS_SELECTOR, selector)
                    for element in elements:
                        href = element.get_attribute('href')
                        if href and 'google.com' not in href:
                            print(f"‚úÖ Found website: {href}")
                            return href
                except Exception:
                    # Best effort: a failing selector must not stop the others,
                    # but Ctrl-C / SystemExit should still propagate.
                    continue

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for link in soup.find_all('a', href=True):
                href = link['href']
                if _is_external_website_link(href):
                    print(f"‚úÖ Found website via fallback: {href}")
                    return href
        except Exception as e:
            print(f"‚ùå Error with Selenium: {e}")
        finally:
            # Always release the browser, including on the early returns above.
            if driver:
                driver.quit()

    try:
        headers = {'User-Agent': ua.random}
        response = requests.get(maps_url, headers=headers, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            href = link['href']
            # Same exclusions as the Selenium fallback, so this path cannot
            # return a gstatic / YouTube / Facebook link as "the website".
            if _is_external_website_link(href):
                print(f"‚úÖ Found website: {href}")
                return href
    except Exception as e:
        print(f"‚ùå Error with requests: {e}")

    print("‚ö†Ô∏è Could not extract website URL")
    return None

# ============================================================================
# WEBSITE CRAWLER
# ============================================================================

def is_valid_url(url, base_domain):
    """Decide whether ``url`` is an in-site page worth crawling.

    Args:
        url: Absolute URL of the candidate link.
        base_domain: URL (or ``scheme://host`` prefix) of the site being
            crawled; only its network location is compared.

    Returns:
        bool: True when ``url`` is on the same host as ``base_domain``, does
        not point at a binary/media file and contains none of the skip
        patterns. False otherwise, including when ``url`` cannot be parsed.
    """
    skip_extensions = ('.pdf', '.jpg', '.jpeg', '.png', '.gif', '.zip', '.exe', '.mp4', '.mp3')
    skip_patterns = ('#', 'javascript:', 'mailto:', 'tel:', '/cdn-cgi/', '/wp-admin/')
    try:
        parsed = urlparse(url)
        base_parsed = urlparse(base_domain)
        # Host names are case-insensitive, so compare them lower-cased.
        if parsed.netloc.lower() != base_parsed.netloc.lower():
            return False
        lowered = url.lower()
        # Check the path too: a query string or fragment must not hide the
        # file extension (e.g. "/brochure.pdf?download=1").
        if lowered.endswith(skip_extensions) or parsed.path.lower().endswith(skip_extensions):
            return False
        return not any(pattern in lowered for pattern in skip_patterns)
    except (ValueError, AttributeError, TypeError):
        # Malformed URLs (e.g. broken IPv6 literals) or non-string input.
        return False

def crawl_website(start_url, max_depth=MAX_CRAWL_DEPTH, max_pages=MAX_PAGES):
    """Breadth-first crawl of one site, collecting the URLs that respond with 200.

    Args:
        start_url: URL the crawl starts from; its scheme and host define
            which links count as in-site.
        max_depth: Maximum link distance from ``start_url`` that is fetched.
        max_pages: Stop once this many pages have been collected.

    Returns:
        list[str]: Discovered page URLs (query strings, fragments and trailing
        slashes removed) in the order they were fetched.
    """
    print(f"üï∑Ô∏è Starting website crawl from: {start_url}")

    root_url = start_url.rstrip('/')
    start_parts = urlparse(start_url)
    base_domain = f"{start_parts.scheme}://{start_parts.netloc}"

    # `seen` holds every URL that was ever queued, so a link repeated on many
    # pages (navigation, footer) is enqueued only once.
    seen = {root_url}
    to_visit = deque([(root_url, 0)])
    pages = []

    while to_visit and len(pages) < max_pages:
        current_url, depth = to_visit.popleft()
        if depth > max_depth:
            continue
        try:
            headers = {'User-Agent': ua.random}
            response = requests.get(current_url, headers=headers, timeout=REQUEST_TIMEOUT, allow_redirects=True)
            if response.status_code != 200:
                continue
            pages.append(current_url)
            print(f"  ‚úì Discovered [{len(pages)}/{max_pages}]: {current_url}")
            if depth >= max_depth:
                # Links found here would exceed the depth limit; skip parsing.
                continue
            soup = BeautifulSoup(response.content, 'html.parser')
            for link in soup.find_all('a', href=True):
                absolute_url = urljoin(current_url, link['href'])
                clean_url = absolute_url.split('#')[0].split('?')[0].rstrip('/')
                if clean_url not in seen and is_valid_url(clean_url, base_domain):
                    seen.add(clean_url)
                    to_visit.append((clean_url, depth + 1))
        except Exception as e:
            # Best-effort crawl: one failing page must not abort the run.
            print(f"  ‚úó Error crawling {current_url}: {str(e)[:50]}")
        finally:
            # Throttle every request, including failed ones.
            time.sleep(RATE_LIMIT_DELAY)

    print(f"‚úÖ Crawl complete! Discovered {len(pages)} pages")
    return pages

# ============================================================================
# LLM-POWERED PAGE SELECTION
# ============================================================================

def _heuristic_page_ranking(page_urls, top_n):
    """Rank URLs by business-related keywords, preferring shorter URLs on ties."""
    priority_keywords = ['home', 'about', 'contact', 'service', 'product', 'portfolio', 'team', 'company']

    def score(url):
        url_lower = url.lower()
        keyword_score = sum(2 for keyword in priority_keywords if keyword in url_lower)
        return keyword_score + (100 - len(url)) / 100

    # sorted() is stable, so equally scored URLs keep their crawl order.
    return sorted(page_urls, key=score, reverse=True)[:top_n]


def select_relevant_pages_with_llm(page_urls, top_n=TOP_PAGES_TO_ANALYZE):
    """Pick the pages most likely to contain company information.

    The LLM is asked for a JSON array of 1-indexed positions into
    ``page_urls``. If the call fails, the answer cannot be parsed, or it
    contains no usable position, a keyword heuristic is used instead.

    Args:
        page_urls: Candidate page URLs.
        top_n: Maximum number of URLs to return.

    Returns:
        list[str]: At most ``top_n`` distinct URLs taken from ``page_urls``;
        an empty list when ``page_urls`` is empty.
    """
    print(f"\nü§ñ Using LLM to select top {top_n} most relevant pages...")
    if not page_urls:
        # Nothing to choose from; avoid a pointless API call.
        return []

    url_list = "\n".join([f"{i+1}. {url}" for i, url in enumerate(page_urls)])
    
    system_prompt = """You are an expert web analyst. Your job is to identify the most relevant pages that contain business information like company details, contact info, services, and about information."""
    
    user_prompt = f"""Analyze these {len(page_urls)} URLs and select the top {top_n} most relevant pages for extracting company information (like contact details, about us, services, etc.).

URLs:
{url_list}

Return ONLY a valid JSON array of numbers (1-indexed positions) like: [1, 3, 5, 7, 9]
Do not include any other text or explanation."""

    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.2,
            max_tokens=500
        )
        content = response.choices[0].message.content.strip()
        print(f"üîç LLM raw response: {content[:200]}")
        
        # The model sometimes wraps its answer in a markdown code fence.
        if '```' in content:
            content = content.split('```')[1]
            if content.startswith('json'):
                content = content[4:]
            content = content.strip()
        
        selected_indices = json.loads(content)
        if not isinstance(selected_indices, list):
            raise ValueError("expected a JSON array of page numbers")

        selected_urls = []
        for index in selected_indices:
            # bool is a subclass of int, so reject it explicitly.
            if isinstance(index, bool) or not isinstance(index, int):
                continue
            if not 0 < index <= len(page_urls):
                continue
            url = page_urls[index - 1]
            if url not in selected_urls:
                selected_urls.append(url)
        selected_urls = selected_urls[:top_n]
        if not selected_urls:
            raise ValueError("LLM returned no usable page numbers")
        
        print(f"‚úÖ LLM selected {len(selected_urls)} pages:")
        for i, url in enumerate(selected_urls, 1):
            print(f"   {i}. {url}")
        return selected_urls
        
    except Exception as e:
        print(f"‚ö†Ô∏è LLM selection error: {e}")
        print("üîÑ Using heuristic fallback...")
        selected = _heuristic_page_ranking(page_urls, top_n)
        print(f"‚úÖ Heuristic selected {len(selected)} pages")
        return selected

# ============================================================================
# CONTENT EXTRACTOR
# ============================================================================

def extract_page_content(url):
    """Download ``url`` and return its visible text together with the raw HTML.

    Returns:
        tuple[str, str]: ``(cleaned_text, html)``. Both are empty strings when
        the response status is not 200 or any error occurs.
    """
    try:
        request_headers = {'User-Agent': ua.random}
        response = requests.get(url, headers=request_headers, timeout=REQUEST_TIMEOUT)
        if response.status_code == 200:
            html_content = response.text
            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove elements whose text is never shown to a visitor.
            for tag in soup(['script', 'style', 'iframe', 'noscript']):
                tag.decompose()

            raw_text = soup.get_text(separator=' ', strip=True)

            # Collapse the text into single-space separated, non-empty pieces.
            pieces = []
            for line in raw_text.splitlines():
                for phrase in line.strip().split("  "):
                    phrase = phrase.strip()
                    if phrase:
                        pieces.append(phrase)

            return ' '.join(pieces), html_content
        return "", ""
    except Exception as e:
        print(f"  ‚úó Error extracting content from {url}: {e}")
        return "", ""

def extract_social_from_html(html_list):
    """Extract social media profile links directly from raw HTML.

    Args:
        html_list: Iterable of HTML strings (one per page). A single string is
            also accepted. Non-string entries are ignored.

    Returns:
        list[str]: Distinct profile URLs without a trailing slash, sorted so
        the result is deterministic. Links of 20 characters or fewer are
        dropped.
    """
    if not html_list:
        return []
    if isinstance(html_list, str):
        html_list = [html_list]

    all_html = " ".join(chunk for chunk in html_list if isinstance(chunk, str))

    # The character class stops at quotes, '?', '#' and '>', so matches never
    # contain attribute delimiters, query strings or fragments.
    patterns = [
        r'https?://(?:www\.)?facebook\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?instagram\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?linkedin\.com/(?:company|in)/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?(?:twitter|x)\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?youtube\.com/(?:c|channel|user|@)[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?tiktok\.com/@[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?pinterest\.com/[a-zA-Z0-9._/-]+',
    ]

    socials = set()
    for pattern in patterns:
        for link in re.findall(pattern, all_html, re.IGNORECASE):
            link = link.rstrip('/')
            if len(link) > 20:
                socials.add(link)

    return sorted(socials)

# ============================================================================
# IMPROVED LLM EXTRACTION WITH BETTER PROMPTS
# ============================================================================

def extract_business_data_with_llm(page_contents, main_website_url, all_html=None):
    """Consolidate business data from several pages with one LLM call.

    Regex-derived emails, phone numbers and social links are passed to the
    model as hints, and are used as a fallback when the model returns none.

    Args:
        page_contents: Mapping of page URL to that page's cleaned text.
        main_website_url: Root URL of the company website.
        all_html: Optional list of raw HTML strings used to find social links.

    Returns:
        dict: Keys ``company_name``, ``company_main_url``, ``emails``,
        ``contact_numbers``, ``social_media_links`` and ``summary``. When the
        LLM call or its JSON parsing fails, the result of
        ``create_fallback_data`` is returned instead.
    """
    print("\nü§ñ Using LLM to extract consolidated business data...")
    
    # Extract social media from HTML first
    social_media_from_html = extract_social_from_html(all_html) if all_html else []
    print(f"üîó Found {len(social_media_from_html)} social media links in HTML")
    
    # Also extract emails and phones from raw content as hints.
    # Sorted so the hints (and the regex fallback) are deterministic.
    all_text = " ".join(page_contents.values())
    email_hints = sorted(set(re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b', all_text)))
    # Retina image names such as "logo@2x.png" look like emails to the regex.
    email_hints = [e for e in email_hints if not e.lower().endswith(('.png', '.jpg', '.gif', '.svg'))]
    phone_hints = sorted(set(re.findall(r'\+?\d[\d\s\-\(\)]{8,}\d', all_text)))
    
    print(f"üìß Found {len(email_hints)} email hints in text")
    print(f"üì± Found {len(phone_hints)} phone hints in text")
    
    # Prepare content with smart truncation
    max_chars_per_page = 10000  # Increased for better context
    combined_content = "".join(
        f"\n\n=== PAGE: {url} ===\n{content[:max_chars_per_page]}\n"
        for url, content in page_contents.items()
    )
    
    max_total_chars = 80000  # Increased limit
    if len(combined_content) > max_total_chars:
        combined_content = combined_content[:max_total_chars] + "\n\n[Content truncated for length]"
    
    print(f"üìä Total content length: {len(combined_content)} characters")

    # Build hints for the LLM
    hints = ""
    if social_media_from_html:
        hints += f"\n\nSOCIAL MEDIA FOUND IN HTML: {', '.join(social_media_from_html[:5])}"
    if email_hints:
        hints += f"\n\nEMAIL HINTS FOUND: {', '.join(email_hints[:5])}"
    if phone_hints:
        hints += f"\n\nPHONE HINTS FOUND: {', '.join(phone_hints[:5])}"

    # IMPROVED SYSTEM PROMPT
    system_prompt = """You are an expert business intelligence analyst specializing in extracting comprehensive company information from websites. 

Your task is to:
1. Carefully analyze ALL provided content from multiple pages
2. Extract EVERY piece of contact information (emails, phones, social media)
3. Write a detailed, informative business summary (8-12 sentences minimum)
4. Return clean, valid JSON with no markdown formatting

Be thorough and precise. The summary should give readers a complete understanding of what the business does."""
    
    # IMPROVED USER PROMPT with better instructions
    user_prompt = f"""Analyze this complete website content for: {main_website_url}

{combined_content}{hints}

Extract comprehensive business information and return ONLY a JSON object (no markdown, no explanation):

{{
  "company_name": "Full official company name",
  "company_main_url": "{main_website_url}",
  "emails": ["all emails found"],
  "contact_numbers": ["all phone numbers with country codes if available"],
  "social_media_links": ["all social media URLs"],
  "summary": "DETAILED 8-12 sentence summary covering: (1) What the company does and its main business focus, (2) Primary products/services offered with specifics, (3) Target market or customer base, (4) Unique selling points or competitive advantages, (5) Company values or mission if mentioned, (6) Notable achievements or credentials, (7) Geographic service area if mentioned, (8) Any other relevant business details"
}}

CRITICAL REQUIREMENTS:
- EMAILS: Search for email addresses in contact pages, footers, about pages. Look for patterns like name@domain.com
- PHONES: Find ALL phone numbers including those in "Contact Us", "Call Us", footers, headers
- SOCIAL MEDIA: Include Facebook, Instagram, LinkedIn, Twitter/X, YouTube, TikTok, Pinterest links
- SUMMARY: Must be 8-12 sentences minimum. Be specific about what they do, not generic. Use actual details from the content.
- Use empty arrays [] for missing data, never null or omit fields
- Return ONLY valid JSON, no markdown backticks or preamble"""

    try:
        print("üîÑ Calling OpenAI API with improved prompts...")
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.1,  # Low temperature for consistency
            max_tokens=6000,  # Increased for detailed summaries
            response_format={"type": "json_object"}
        )
        
        content = response.choices[0].message.content.strip()
        print(f"üîç LLM response received ({len(content)} chars)")
        
        # Parse and validate
        extracted_data = json.loads(content)
        if not isinstance(extracted_data, dict):
            raise ValueError("LLM response is not a JSON object")
        
        # Merge with HTML-extracted social media. The model may return null
        # or non-string items here; that must not discard the whole response.
        if social_media_from_html:
            llm_socials = extracted_data.get('social_media_links')
            if not isinstance(llm_socials, list):
                llm_socials = []
            merged_socials = {s for s in llm_socials if isinstance(s, str)}
            merged_socials.update(social_media_from_html)
            extracted_data['social_media_links'] = sorted(merged_socials)
        
        # Validate and clean
        extracted_data = validate_and_clean_data(extracted_data, main_website_url)
        
        # Check if summary is too generic or short
        summary = extracted_data.get('summary', '')
        if not isinstance(summary, str) or len(summary) < 200 or 'unable to generate' in summary.lower():
            print("‚ö†Ô∏è Summary too short or generic, regenerating...")
            extracted_data['summary'] = generate_better_summary(all_text, extracted_data['company_name'])
        
        # If still no contact info after LLM, try regex fallback
        if not extracted_data.get('emails'):
            print("‚ö†Ô∏è No emails from LLM, using regex fallback...")
            extracted_data['emails'] = email_hints[:10]
        
        if not extracted_data.get('contact_numbers'):
            print("‚ö†Ô∏è No phones from LLM, using regex fallback...")
            extracted_data['contact_numbers'] = phone_hints[:10]
        
        print("‚úÖ Data extraction successful via LLM!")
        return extracted_data
        
    except json.JSONDecodeError as e:
        print(f"‚ùå JSON parsing error: {e}")
        return create_fallback_data(main_website_url, page_contents, all_html)
    except Exception as e:
        print(f"‚ùå LLM extraction error: {type(e).__name__}: {e}")
        return create_fallback_data(main_website_url, page_contents, all_html)

def generate_better_summary(content, company_name):
    """Ask the LLM for an 8-12 sentence company summary.

    Only the first 20000 characters of ``content`` are sent. If anything goes
    wrong, a generic placeholder sentence mentioning ``company_name`` is
    returned instead of raising.
    """
    system_message = "You are a business analyst who writes detailed, informative company summaries. Never write generic summaries."
    try:
        excerpt = content[:20000]  # More content for better summary
        user_message = f"""Write a comprehensive 8-12 sentence business summary for {company_name} based on this website content:

{excerpt}

Your summary MUST include:
1. What specific products/services they offer (be detailed, not generic)
2. Who their target customers are
3. What makes them unique or different
4. Their business approach or values
5. Any notable achievements, credentials, or experience mentioned
6. Geographic area they serve if mentioned
7. Specific details about their offerings (menu items, services, specialties, etc.)

Write in a professional but engaging tone. Be specific and use actual details from the content. Start directly with the summary."""
        conversation = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ]
        completion = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=conversation,
            temperature=0.3,
            max_tokens=800
        )
        summary = completion.choices[0].message.content.strip()
    except Exception as e:
        print(f"‚ö†Ô∏è Could not generate detailed summary: {e}")
        return f"{company_name} is a business operating at the provided website. Due to technical limitations, a detailed summary could not be generated. Please visit their website directly for comprehensive information about their products, services, and business offerings."

    print(f"‚úÖ Generated detailed summary ({len(summary)} chars)")
    return summary

def validate_and_clean_data(data, main_url):
    """Validate and clean extracted data.

    Args:
        data: Dict parsed from the LLM response; any field may be missing,
            null or of an unexpected type.
        main_url: Root URL of the company website. It is always used for
            ``company_main_url`` and as the source of a fallback company name.

    Returns:
        dict: Keys ``company_name``, ``company_main_url``, ``emails``,
        ``contact_numbers``, ``social_media_links`` and ``summary``. The three
        list fields contain stripped, de-duplicated strings in input order.
    """
    email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    social_domains = ['facebook.com', 'instagram.com', 'linkedin.com', 'twitter.com', 'x.com',
                      'youtube.com', 'tiktok.com', 'pinterest.com']

    def is_social(link):
        # Match on the host name: a plain substring test would accept
        # e.g. "dropbox.com" because it contains "x.com".
        try:
            host = urlparse(link if '//' in link else f'//{link}').hostname or ''
        except ValueError:
            return False
        return any(host == domain or host.endswith('.' + domain) for domain in social_domains)

    def clean_list(values, accept, allowed_types=(str,)):
        # Keep stripped, unique items of an allowed type that pass `accept`.
        if not isinstance(values, list):
            return []
        kept = []
        for value in values:
            # bool is an int subclass; it is never a meaningful value here.
            if isinstance(value, bool) or not isinstance(value, allowed_types):
                continue
            text = str(value).strip()
            if accept(text) and text not in kept:
                kept.append(text)
        return kept

    return {
        "company_name": data.get("company_name") or urlparse(main_url).netloc.replace('www.', '').split('.')[0].title(),
        "company_main_url": main_url,
        "emails": clean_list(data.get("emails"), lambda text: re.match(email_pattern, text) is not None),
        # Phone numbers sometimes arrive as JSON numbers rather than strings.
        "contact_numbers": clean_list(data.get("contact_numbers"), lambda text: len(text) > 5, (str, int)),
        "social_media_links": clean_list(data.get("social_media_links"), is_social),
        "summary": data.get("summary") or "No summary available"
    }

# ============================================================================
# IMPROVED FALLBACK EXTRACTION
# ============================================================================

def _fallback_phone_numbers(text):
    """Return distinct phone-like strings from text, one per underlying number."""
    phone_patterns = [
        # International form with a leading "+<country code>".
        r'\+\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}',
        # 3-3-4 digit form; the area-code parentheses are optional.
        r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
    ]

    # Key candidates by their digits so formatting variants collapse.
    candidates = {}
    for pattern in phone_patterns:
        for match in re.findall(pattern, text):
            digits = re.sub(r'\D', '', match)
            if len(digits) >= 10:
                candidates.setdefault(digits, match.strip())

    # The local pattern also matches the tail of an international number;
    # keep only the longest form of each number.
    kept = {}
    for digits in sorted(candidates, key=len, reverse=True):
        if not any(longer.endswith(digits) for longer in kept):
            kept[digits] = candidates[digits]
    return sorted(kept.values())


def create_fallback_data(main_website_url, page_contents, all_html=None):
    """Regex-based extraction used when the LLM extraction fails.

    Args:
        main_website_url: Root URL of the company website; the company name is
            derived from its host name.
        page_contents: Mapping of page URL to that page's cleaned text.
        all_html: Optional list of raw HTML strings used to find social links.

    Returns:
        dict: Same keys as the LLM path plus ``extraction_method``. Emails and
        phone numbers are sorted and capped at 15 entries each. The summary
        comes from ``generate_better_summary``.
    """
    print("üîÑ Using enhanced fallback extraction...")
    
    all_text = " ".join(page_contents.values())
    social_from_html = extract_social_from_html(all_html) if all_html else []
    
    # Email extraction; image names such as "logo@2x.png" are not emails.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b'
    emails = {
        e for e in re.findall(email_pattern, all_text)
        if not e.lower().endswith(('.png', '.jpg', '.gif', '.svg'))
    }
    
    phones = _fallback_phone_numbers(all_text)
    
    # Social media extraction from text. The character class excludes '?' and
    # '#', so matches never carry a query string or fragment.
    social_patterns = [
        r'https?://(?:www\.)?facebook\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?instagram\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?linkedin\.com/(?:company|in)/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?(?:twitter|x)\.com/[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?youtube\.com/(?:c|channel|user|@)[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?tiktok\.com/@[a-zA-Z0-9._/-]+',
        r'https?://(?:www\.)?pinterest\.com/[a-zA-Z0-9._/-]+',
    ]
    
    all_socials = set(social_from_html)
    for pattern in social_patterns:
        for link in re.findall(pattern, all_text, re.IGNORECASE):
            all_socials.add(link.rstrip('/'))
    
    # Company name: first label of the host, split on '-'/'_' and capitalised.
    company_name = urlparse(main_website_url).netloc.replace('www.', '').split('.')[0]
    company_name = ' '.join(word.capitalize() for word in re.split(r'[-_]', company_name))
    
    # Generate summary with LLM
    summary = generate_better_summary(all_text, company_name)
    
    return {
        "company_name": company_name,
        "company_main_url": main_website_url,
        "emails": sorted(emails)[:15],
        "contact_numbers": phones[:15],
        "social_media_links": sorted(all_socials),
        "summary": summary,
        "extraction_method": "enhanced_regex_fallback_with_llm_summary"
    }

# ============================================================================
# MAIN ORCHESTRATION
# ============================================================================

def scrape_business_data(google_maps_url):
    """Run the full pipeline: Maps URL -> website -> crawl -> select -> extract.

    Args:
        google_maps_url: Google Maps link of the business.

    Returns:
        tuple: ``(extracted_data, business_name, pages_discovered,
        pages_analyzed)``. ``extracted_data`` and ``business_name`` are None
        when a stage produced nothing. Otherwise ``business_name`` is a
        non-empty, filesystem-friendly slug of the company name.
    """
    print("="*80)
    print("üöÄ STARTING INTELLIGENT BUSINESS DATA EXTRACTION")
    print("="*80)
    
    website_url = extract_website_from_google_maps(google_maps_url)
    if not website_url:
        print("‚ùå Failed to extract website URL from Google Maps")
        return None, None, 0, 0

    # Reduce whatever page Maps linked to down to the site root.
    parsed = urlparse(website_url)
    website_url = f"{parsed.scheme}://{parsed.netloc}/"
    print(f"\nüìç Main Website: {website_url}\n")
    
    all_pages = crawl_website(website_url)
    if not all_pages:
        print("‚ùå No pages found to scrape")
        return None, None, 0, 0
    
    selected_pages = select_relevant_pages_with_llm(all_pages, TOP_PAGES_TO_ANALYZE)
    if not selected_pages:
        print("‚ùå No pages selected for analysis")
        return None, None, len(all_pages), 0

    print(f"\nüì• Extracting content from {len(selected_pages)} selected pages...")
    page_contents = {}
    all_html = []
    
    for i, page_url in enumerate(selected_pages, 1):
        print(f"  [{i}/{len(selected_pages)}] Extracting: {page_url}")
        text_content, html_content = extract_page_content(page_url)
        if text_content:
            page_contents[page_url] = text_content
            all_html.append(html_content)
            print(f"      ‚úì Extracted {len(text_content)} characters")
        time.sleep(RATE_LIMIT_DELAY)
    
    if not page_contents:
        print("‚ùå No content extracted from any page")
        return None, None, len(all_pages), len(selected_pages)

    extracted_data = extract_business_data_with_llm(page_contents, website_url, all_html)

    # Build a file-name-safe slug. A missing name, or one that sanitises to
    # nothing, must not make the caller treat the run as failed.
    raw_name = extracted_data.get('company_name') or ''
    business_name = re.sub(r'[^\w\s-]', '', str(raw_name))
    business_name = re.sub(r'[-\s]+', '_', business_name).strip('_').lower()
    if not business_name:
        business_name = 'unknown_business'

    print("\n" + "="*80)
    print("‚úÖ EXTRACTION COMPLETE!")
    print("="*80)
    
    return extracted_data, business_name, len(all_pages), len(page_contents)

# ============================================================================
# SAVE AND DISPLAY
# ============================================================================

def save_results(extracted_data, business_name, all_pages_count, selected_pages_count):
    """Write the extracted data plus run metadata to a timestamped JSON file.

    The file is created in the current working directory and is named
    ``<business_name>_<YYYYMMDD_HHMMSS>.json``.

    Returns:
        str: The name of the file that was written.
    """
    filename = "{}_{}.json".format(business_name, time.strftime("%Y%m%d_%H%M%S"))

    metadata = {
        "total_pages_discovered": all_pages_count,
        "pages_analyzed": selected_pages_count,
        "extraction_method": "LLM-powered intelligent extraction",
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "model_used": "gpt-4o-mini",
    }
    payload = {"business_data": extracted_data, "extraction_metadata": metadata}

    with open(filename, 'w', encoding='utf-8') as out_file:
        json.dump(payload, out_file, indent=2, ensure_ascii=False)

    print(f"\nüíæ Results saved to: {filename}")
    return filename

def display_summary(extracted_data):
    """Print a human-readable report of the extracted business data.

    Args:
        extracted_data: Dict as produced by the extraction step. Missing or
            null list fields are shown as "None found".

    Returns:
        None. The report is written to stdout.
    """
    # (label, domain fragments) checked in order; the first match wins.
    platforms = (
        ("Facebook", ("facebook.com",)),
        ("Instagram", ("instagram.com",)),
        ("LinkedIn", ("linkedin.com",)),
        ("Twitter/X", ("twitter.com", "x.com")),
        ("YouTube", ("youtube.com",)),
        ("TikTok", ("tiktok.com",)),
        ("Pinterest", ("pinterest.com",)),
    )

    def print_preview(heading, items, limit=5):
        # Show the first `limit` items and summarise the remainder.
        print(f"\n{heading} ({len(items)} found):")
        if not items:
            print("   None found")
            return
        for item in items[:limit]:
            print(f"   ‚Ä¢ {item}")
        if len(items) > limit:
            print(f"   ... and {len(items) - limit} more")

    print("\n" + "="*80)
    print("üìä EXTRACTION SUMMARY")
    print("="*80)
    
    print(f"\nüè¢ Company Name: {extracted_data.get('company_name', 'N/A')}")
    print(f"üåê Website: {extracted_data.get('company_main_url', 'N/A')}")
    
    print_preview("üìß Emails", extracted_data.get('emails') or [])
    print_preview("üì± Phone Numbers", extracted_data.get('contact_numbers') or [])
    
    socials = extracted_data.get('social_media_links') or []
    print(f"\nüîó Social Media ({len(socials)} links):")
    if socials:
        for link in socials:
            # Host names are case-insensitive, so match on a lowered copy.
            lowered = str(link).lower()
            platform = next(
                (label for label, domains in platforms
                 if any(domain in lowered for domain in domains)),
                "Unknown",
            )
            print(f"   ‚Ä¢ {platform}: {link}")
    else:
        print("   None found")
    
    summary = extracted_data.get('summary', 'N/A')
    print(f"\nüìù Business Summary:")
    print(f"   {summary}")
    
    if 'extraction_method' in extracted_data:
        print(f"\n‚öôÔ∏è Extraction Method: {extracted_data['extraction_method']}")
    
    print("\n" + "="*80)


# ============================================================================
# EXECUTION
# ============================================================================

# Google Maps link of the business to analyze.
GOOGLE_MAPS_URL = "https://maps.app.goo.gl/docBXMhBUZ5qGZ3f9"

if __name__ == "__main__":
    if not OPENAI_API_KEY or OPENAI_API_KEY == "your-api-key-here":
        print("‚ùå ERROR: Please set your OPENAI_API_KEY!")
    else:
        extracted_data, business_name, all_pages_discovered, pages_analyzed = scrape_business_data(GOOGLE_MAPS_URL)
        
        if extracted_data and business_name:
            filename = save_results(
                extracted_data, 
                business_name,
                all_pages_discovered,
                pages_analyzed
            )
            display_summary(extracted_data)
            # Offer a browser download when running in Colab. Anywhere else
            # the import (or the download) fails and the local path is shown.
            # `except Exception` keeps Ctrl-C / SystemExit working.
            try:
                from google.colab import files
                files.download(filename)
                print("üì• File download started!")
            except Exception:
                print(f"üìÅ File saved locally: {filename}")
        else:
            print("‚ùå No data extracted. Check the URL and API key, then try again.")

print("\n‚úÖ Script execution complete!")

✅ Configuration loaded successfully!
🚀 STARTING INTELLIGENT BUSINESS DATA EXTRACTION
🔍 Extracting website from Google Maps URL...
✅ Found website: http://thesintrahotel.com/

📍 Main Website: http://thesintrahotel.com/

🕷️ Starting website crawl from: http://thesintrahotel.com/
  ✓ Discovered [1/30]: http://thesintrahotel.com
  ✓ Discovered [2/30]: http://thesintrahotel.com/about-online-hotel-booking-islamabad
  ✓ Discovered [3/30]: http://thesintrahotel.com/executive-room-at-hotel-islamabad
  ✓ Discovered [4/30]: http://thesintrahotel.com/twin-executive-room-at-hotel-islamabad
  ✓ Discovered [5/30]: http://thesintrahotel.com/super-deluxe-room-at-hotel-islamabad
  ✓ Discovered [6/30]: http://thesintrahotel.com/sintra-hotel-islamabad-pictures
  ✓ Discovered [7/30]: http://thesintrahotel.com/room-decor-packages
  ✓ Discovered [8/30]: http://thesintrahotel.com/places-to-visit-in-islamabad-and-fun-activities
  ‚úì Discovered [9/30]: http://thesintrahotel.co