In [None]:
# GOOGLE MAPS SCRAPER
# Step 1: Install dependencies
print("📦 Installing dependencies...")
import os
import subprocess
import sys

def install_package(package):
    """Install ``package`` with pip, using the interpreter that runs this code.

    Raises ``subprocess.CalledProcessError`` if pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Install required Python packages.
# Keys are pip distribution names, values are the module names used to probe
# for an existing install. The two differ for beautifulsoup4 (imported as
# ``bs4``); probing with the pip name would always raise ImportError and
# re-run pip on every execution.
packages = {
    'selenium': 'selenium',
    'beautifulsoup4': 'bs4',
    'requests': 'requests',
    'pandas': 'pandas',
}
for package, module_name in packages.items():
    try:
        __import__(module_name)
        print(f"{package} already installed")
    except ImportError:
        print(f"Installing {package}...")
        install_package(package)

# Step 2: Install Chrome and ChromeDriver for Colab
print("Setting up Chrome and ChromeDriver...")
# Shell steps, executed in order: refresh the apt index, install the
# chromium-chromedriver package, then copy the driver binary into /usr/bin.
# Exit codes are not checked.
chrome_setup_commands = (
    'apt-get update > /dev/null 2>&1',
    'apt install chromium-chromedriver > /dev/null 2>&1',
    'cp /usr/lib/chromium-browser/chromedriver /usr/bin',
)
for shell_command in chrome_setup_commands:
    os.system(shell_command)
print("Chrome setup complete!")

In [None]:
import requests
import csv
import re
import random
import time
from urllib.parse import urlparse, urljoin, quote_plus
from bs4 import BeautifulSoup
import json
import pandas as pd
import ssl
import urllib3
from datetime import datetime
import zipfile
import io
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import os
import sys
import argparse

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException, NoSuchElementException


# Mount Google Drive when running inside Colab. The import is guarded so the
# script can also be started outside Colab (where the google.colab package
# does not exist) instead of crashing at import time.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')
    # Change the name of folder to where you want to save the results in Drive
    SAVE_ROOT = "/content/drive/MyDrive/COLAB CLIENTS/LeadSheets"
else:
    # No Drive available: keep results in a folder under the current working directory.
    SAVE_ROOT = os.path.join(os.getcwd(), "LeadSheets")
os.makedirs(SAVE_ROOT, exist_ok=True)



def _running_in_colab() -> bool:
    """Report whether the ``google.colab`` module is loaded in this process."""
    loaded_modules = sys.modules
    return "google.colab" in loaded_modules

def _parse_start_batch(default: int = 0) -> int:
    """Resolve the batch number to resume from.

    Sources are consulted in this order, and the first usable one wins:

    1. the ``--start-batch`` command-line option (unknown options are ignored),
    2. the ``START_BATCH`` environment variable (skipped if not an integer),
    3. an interactive prompt, only when running in Colab,
    4. ``default``.

    The result is never negative; negative values are clamped to 0.
    """
    cli = argparse.ArgumentParser(add_help=False)
    cli.add_argument("--start-batch", type=int, default=None)
    known, _unknown = cli.parse_known_args()

    cli_value = known.start_batch
    if cli_value is not None:
        return max(0, int(cli_value))

    raw_env = os.getenv("START_BATCH")
    if raw_env is not None:
        try:
            return max(0, int(raw_env))
        except ValueError:
            # Not an integer: fall through to the next source.
            pass

    if _running_in_colab():
        try:
            typed = input(f"Last batch processed # (press Enter if want to start from begining: ").strip()
            if typed != "":
                return max(0, int(typed))
        except Exception:
            # Best effort: any failure to read or parse the answer falls back to the default.
            pass

    return max(0, int(default))

# Batch to resume from, resolved once when this cell/module runs
# (command-line flag, then START_BATCH env var, then an interactive prompt in Colab).
START_BATCH = _parse_start_batch(0)


# Disable SSL warnings for problematic certificates
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class ZipCodeBusinessScraper:
    def __init__(self):
        self.all_businesses = []
        self.processed_businesses = set()
        self.zip_results = {}
        self.zip_codes = []
        self.email_cache = {}  # Cache emails to prevent re-extraction
        self.website_cache = {}  # Cache website validation results
        self.intermediate_files = []  # Track intermediate CSV files
        self.csv_counter = 1  # Counter for intermediate CSV files
        self.setup_driver()

    def setup_driver(self):
        """Start a headless Chrome WebDriver and store it on ``self.driver``."""
        options = Options()

        # Command-line switches, added in the listed order.
        # "--disable-javascript" is deliberately not included (left to check later).
        switches = (
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--window-size=1920,1080",
            "--disable-blink-features=AutomationControlled",
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            # Performance optimizations
            "--disable-images",
            "--disable-plugins",
            "--disable-extensions",
            "--no-first-run",
            "--disable-default-apps",
        )
        for switch in switches:
            options.add_argument(switch)

        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)

        browser = webdriver.Chrome(options=options)
        self.driver = browser
        # Make navigator.webdriver read as undefined on the current page.
        browser.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

        browser.set_page_load_timeout(20)
        browser.implicitly_wait(3)

    def get_zip_codes_for_city(self, city_name, state_name=None):
        print(f"\nFinding ZIP codes for {city_name}...")

        zip_codes = set()

        # Method 1: ZIP code API
        zip_codes.update(self.get_zip_codes_from_api(city_name, state_name))

        # If we have enough from API, skip scraping
        if len(zip_codes) >= 10:
            final_zip_codes = sorted(list(zip_codes))
            print(f"Found {len(final_zip_codes)} ZIP codes from API")
            return final_zip_codes

        # Method 2: Use known ZIP code ranges (backup)
        if not zip_codes:
            zip_codes.update(self.get_known_zip_codes(city_name))

        # Method 3: Scrape only if necessary and we have few results
        if len(zip_codes) < 5:
            zip_codes.update(self.scrape_zip_codes_from_websites(city_name, state_name))

        final_zip_codes = sorted(list(zip_codes))
        print(f"Found {len(final_zip_codes)} ZIP codes for {city_name}: {final_zip_codes[:10]}{'...' if len(final_zip_codes) > 10 else ''}")

        return final_zip_codes

    def get_zip_codes_from_api(self, city_name, state_name):
        """Query api.zippopotam.us for a city's post codes.

        Returns a set of post-code strings. Any failure (network, JSON, bad
        input) is printed and results in whatever was collected so far,
        normally an empty set.
        """
        collected = set()

        try:
            city_segment = city_name.replace(' ', '%20')
            if state_name:
                state_abbr = self.get_state_abbreviation(state_name)
                endpoint = f"http://api.zippopotam.us/us/{state_abbr}/{city_segment}"
            else:
                endpoint = f"http://api.zippopotam.us/us/{city_segment}"

            reply = requests.get(endpoint, timeout=6)

            if reply.status_code == 200:
                payload = reply.json()
                if 'places' in payload:
                    collected.update(
                        place['post code']
                        for place in payload['places']
                        if 'post code' in place
                    )

        except Exception as e:
            print(f"API method failed: {e}")

        return collected

    def scrape_zip_codes_from_websites(self, city_name, state_name):
        zip_codes = set()

        try:
            new_codes = self.scrape_unitedstateszipcodes_com(city_name, state_name)
            zip_codes.update(new_codes)
        except Exception as e:
            print(f"Website scraping failed: {e}")

        return zip_codes

    def scrape_unitedstateszipcodes_com(self, city_name, state_name):
        """Collect 5-digit codes found on a unitedstateszipcodes.org city page.

        When no state is given, the first entry from ``get_likely_states`` is
        used to build the URL. Every 5-digit (or ZIP+4) number in the page text
        is taken, truncated to its first five digits. Errors are printed and
        produce an empty (or partial) set.
        """
        harvested = set()

        try:
            city_slug = city_name.lower().replace(' ', '-')
            state_slug = (state_name or '').lower().replace(' ', '-')

            if state_slug:
                page_url = f"https://www.unitedstateszipcodes.org/{state_slug}/{city_slug}/"
            else:
                guesses = self.get_likely_states(city_name)
                page_url = f"https://www.unitedstateszipcodes.org/{guesses[0]}/{city_slug}/"

            page = requests.get(page_url, timeout=8, headers={'User-Agent': 'Mozilla/5.0'})
            if page.status_code == 200:
                hits = re.findall(r'\b\d{5}(?:-\d{4})?\b', page.text)
                # Every hit has at least five characters, so slicing is always safe.
                harvested.update(hit[:5] for hit in hits if len(hit) >= 5)

        except Exception as e:
            print(f"Error scraping unitedstateszipcodes.org: {e}")

        return harvested

    def get_known_zip_codes(self, city_name):
        """Enhanced known ZIP code ranges for major cities"""
        known_zip_ranges = {
            'chicago': list(range(60656, 60662)) + list(range(60701, 60730)),
            'los angeles': list(range(90001, 90099)) + list(range(91001, 91609)) +
                          list(range(90201, 90299)) + list(range(91701, 91799)),
            'new york': list(range(10001, 10299)) + list(range(11201, 11256)) +
                       list(range(10301, 10314)) + list(range(11001, 11099)),
            'houston': list(range(77001, 77099)) + list(range(77201, 77299)) +
                      list(range(77301, 77399)),
            'phoenix': list(range(85001, 85099)) + list(range(85201, 85299)),
            'philadelphia': list(range(19101, 19199)) + list(range(19001, 19099)),
            'san antonio': list(range(78201, 78299)) + list(range(78001, 78099)),
            'san diego': list(range(92001, 92199)) + list(range(91901, 91999)),
            'dallas': list(range(75201, 75299)) + list(range(75001, 75099)),
            'san jose': list(range(95101, 95199)) + list(range(94001, 94099)),
            'miami': list(range(33101, 33199)) + list(range(33001, 33099)),
            'atlanta': list(range(30301, 30399)) + list(range(30001, 30099)),
            'denver': list(range(80201, 80299)) + list(range(80001, 80099)),
            'seattle': list(range(98101, 98199)) + list(range(98001, 98099))
        }

        city_lower = city_name.lower()
        if city_lower in known_zip_ranges:
            zip_codes = [str(code) for code in known_zip_ranges[city_lower]]
            print(f"Using known ZIP codes for {city_name}: {len(zip_codes)} codes")
            return zip_codes

        return []

    def get_state_abbreviation(self, state_name):
        """Convert state name to abbreviation"""
        state_abbrevs = {
            'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR', 'california': 'CA',
            'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE', 'florida': 'FL', 'georgia': 'GA',
            'hawaii': 'HI', 'idaho': 'ID', 'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA',
            'kansas': 'KS', 'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
            'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN', 'mississippi': 'MS',
            'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE', 'nevada': 'NV', 'new hampshire': 'NH',
            'new jersey': 'NJ', 'new mexico': 'NM', 'new york': 'NY', 'north carolina': 'NC',
            'north dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK', 'oregon': 'OR', 'pennsylvania': 'PA',
            'rhode island': 'RI', 'south carolina': 'SC', 'south dakota': 'SD', 'tennessee': 'TN',
            'texas': 'TX', 'utah': 'UT', 'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA',
            'west virginia': 'WV', 'wisconsin': 'WI', 'wyoming': 'WY'
        }

        return state_abbrevs.get(state_name.lower(), state_name.upper()[:2])

    def get_likely_states(self, city_name):
        """Get likely states for a city name"""
        city_states = {
            'chicago': ['illinois'],
            'los angeles': ['california'],
            'new york': ['new-york'],
            'houston': ['texas'],
            'phoenix': ['arizona'],
            'philadelphia': ['pennsylvania'],
            'san antonio': ['texas'],
            'san diego': ['california'],
            'dallas': ['texas'],
            'san jose': ['california'],
            'miami': ['florida'],
            'atlanta': ['georgia'],
            'boston': ['massachusetts'],
            'denver': ['colorado'],
            'seattle': ['washington']
        }

        return city_states.get(city_name.lower(), ['illinois', 'california', 'texas', 'new-york'])

    def create_google_maps_urls(self, search_phrase, zip_codes):
        """Create Google Maps URLs for each ZIP code"""
        print(f"Creating Google Maps URLs for '{search_phrase}' across {len(zip_codes)} ZIP codes...")

        urls = []
        base_url = "https://www.google.com/maps/search/"

        for zip_code in zip_codes:
            query = f"{search_phrase} near {zip_code}"
            encoded_query = quote_plus(query)
            full_url = f"{base_url}{encoded_query}/"

            urls.append({
                'url': full_url,
                'zip_code': zip_code,
                'query': query
            })

        print(f"Created {len(urls)} Google Maps URLs")
        return urls

    def scroll_to_load_businesses(self, max_scrolls=100):  # increase this later
        """Scroll the results feed to the bottom repeatedly so more entries load.

        Stops after ``max_scrolls`` scrolls or once the feed height has stayed
        the same for three scrolls in a row. Any failure is printed, not raised.
        """
        read_height_js = "return arguments[0].scrollHeight"
        jump_to_bottom_js = "arguments[0].scrollTop = arguments[0].scrollHeight"

        try:
            feed_locator = (
                By.XPATH,
                '//div[@role="feed" and @aria-label and contains(@class, "m6QErb")]'
            )
            feed = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(feed_locator)
            )

            previous_height = self.driver.execute_script(read_height_js, feed)
            stalled_rounds = 0

            for _ in range(max_scrolls):
                self.driver.execute_script(jump_to_bottom_js, feed)

                # Randomized pause before re-measuring the feed.
                time.sleep(random.uniform(1.5, 2))

                current_height = self.driver.execute_script(read_height_js, feed)

                if current_height != previous_height:
                    stalled_rounds = 0
                    previous_height = current_height
                    continue

                stalled_rounds += 1
                if stalled_rounds >= 3:
                    break

            total_loaded = len(self.driver.find_elements(By.CSS_SELECTOR, "div.Nv2PK"))
            print(f"Finished scrolling. Total businesses loaded: {total_loaded}")

        except Exception as e:
            print(f"Scrolling failed: {e}")

    def extract_business_data_from_maps(self, maps_url, zip_code, current_num, total_num,
                                        max_businesses=120):
        """Open a Maps search URL and extract the businesses listed for one ZIP code.

        Args:
            maps_url: Google Maps search URL to load.
            zip_code: ZIP code the search is for; stored on every result.
            current_num: 1-based position of this ZIP code (progress output only).
            total_num: Total number of ZIP codes (progress output only).
            max_businesses: Upper bound on result entries opened per ZIP code.

        Returns:
            List of business dicts not seen before in this run (deduplicated on
            ``name`` + ``address`` via ``self.processed_businesses``), each
            extended with ``zip_code`` and ``source_url``. An empty list is
            returned when nothing is found or the page fails to load.
        """
        print(f"Loading Google Maps for ZIP {zip_code} ({current_num}/{total_num})...")

        try:
            self.driver.get(maps_url)
            time.sleep(4)  # Reduced from 8 seconds

            self.scroll_to_load_businesses()

            # Try result-card selectors in order; the first one that matches anything wins.
            business_elements = []
            for selector in ['.Nv2PK', 'div[role="article"]', '.hfpxzc', 'a[data-value="Directions"]']:
                try:
                    elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                except Exception:
                    # A failing selector is not fatal; try the next one.
                    continue
                if elements:
                    business_elements = elements
                    print(f"Found {len(elements)} business elements for ZIP {zip_code}")
                    break

            if not business_elements:
                print(f"No business elements found for ZIP {zip_code}")
                return []

            unique_businesses = []
            limit = max(0, min(len(business_elements), max_businesses))

            for i, element in enumerate(business_elements[:limit]):
                try:
                    print(f"Processing business {i+1}/{limit} in ZIP {zip_code}")

                    self.driver.execute_script(
                        "arguments[0].scrollIntoView({block: 'center', behavior: 'instant'});",
                        element
                    )
                    time.sleep(0.5)

                    try:
                        element.click()
                    except Exception:
                        # Native click failed (e.g. element covered); click via JavaScript instead.
                        self.driver.execute_script("arguments[0].click();", element)

                    time.sleep(3)

                    business_data = self.extract_detailed_business_info()
                    if not (business_data and business_data.get('name')):
                        continue

                    # One key serves both per-ZIP and cross-ZIP deduplication:
                    # everything accepted for this ZIP is also recorded globally.
                    business_key = f"{business_data['name']}_{business_data['address']}"
                    if business_key in self.processed_businesses:
                        print(f"Duplicate skipped: {business_data['name']}")
                        continue

                    self.processed_businesses.add(business_key)

                    business_data['zip_code'] = zip_code
                    business_data['source_url'] = maps_url

                    unique_businesses.append(business_data)
                    print(f"Extracted: {business_data['name']}")

                except Exception as e:
                    print(f"Error processing business {i+1} in ZIP {zip_code}: {str(e)[:50]}...")
                    continue

            print(f"\nZIP {zip_code} complete! Extracted {len(unique_businesses)} unique businesses")
            return unique_businesses

        except Exception as e:
            print(f"Main extraction failed for ZIP {zip_code}: {e}")
            return []

    def extract_detailed_business_info(self):
        """Read name, address, phone, website and email from the currently open place panel.

        Each field is looked up through an ordered list of CSS selectors; a
        selector that raises is skipped. The email is only looked up when a
        website was accepted for the business.

        Returns:
            dict with keys ``name``, ``address``, ``phone``, ``website`` and
            ``email`` (missing values are empty strings), or ``None`` when no
            business name could be read or an unexpected error occurred.
        """
        business_data = {
            'name': '',
            'address': '',
            'phone': '',
            'website': '',
            'email': ''
        }

        try:
            time.sleep(1.5)

            # Extract business name
            name_selectors = [
                'h1.DUwDvf.lfPIob',
                'h1[data-attrid="title"]',
                'h1.x3AX1-LfntMc-header-title-title',
                '.qrShPb .fontHeadlineLarge',
                'h1'
            ]

            for selector in name_selectors:
                try:
                    name_text = self.driver.find_element(By.CSS_SELECTOR, selector).text.strip()
                except Exception:
                    continue
                # Headings containing "search" are rejected as names.
                if name_text and len(name_text) > 2 and 'search' not in name_text.lower():
                    business_data['name'] = name_text
                    break

            # Extract address
            address_selectors = [
                '[data-item-id="address"] .Io6YTe',
                '.Io6YTe.fontBodyMedium',
                '[data-section-id="ad"] .Io6YTe',
                '.rogA2c .Io6YTe',
                '[data-attrid="kc:/location/location:address"] .Io6YTe'
            ]

            for selector in address_selectors:
                try:
                    addr_text = self.driver.find_element(By.CSS_SELECTOR, selector).text.strip()
                except Exception:
                    continue
                # NOTE(review): the street-word test is a plain substring check, so
                # 'st' also matches inside unrelated words; kept as-is.
                if re.search(r'\d+.*\w+.*\d{5}', addr_text) or any(x in addr_text.lower() for x in ['st', 'ave', 'rd', 'blvd']):
                    business_data['address'] = addr_text
                    break

            # Extract phone number
            phone_selectors = [
                '[data-item-id*="phone"] .Io6YTe',
                'a[href^="tel:"]',
                '.rogA2c button[data-item-id*="phone"]',
                '[data-attrid*="phone"] .Io6YTe'
            ]
            phone_pattern = re.compile(r'(\+?1?\s*\(?[0-9]{3}\)?[-.\s]*[0-9]{3}[-.\s]*[0-9]{4})')

            for selector in phone_selectors:
                try:
                    for elem in self.driver.find_elements(By.CSS_SELECTOR, selector):
                        phone_match = phone_pattern.search(elem.text.strip())
                        if phone_match:
                            business_data['phone'] = phone_match.group(1)
                            break
                except Exception:
                    continue
                if business_data['phone']:
                    break

            # Website extraction
            website_selectors = [
                '[role="main"] [data-item-id="authority"] a[href^="http"]',
                '[role="main"] a[data-value="Website"][href^="http"]',
                'a[data-value="Website"]',
                '.m6QErb [data-item-id="authority"] a[href^="http"]',
                'button[aria-label*="Website"]',
                '[role="main"] a[href^="http"]'
            ]

            for selector in website_selectors:
                try:
                    # NOTE(review): all elements of a selector are scanned, so the
                    # last acceptable link wins rather than the first; kept as-is.
                    for elem in self.driver.find_elements(By.CSS_SELECTOR, selector):
                        href = elem.get_attribute('href')
                        if (href and self.is_valid_business_website_cached(href)
                                and self.is_website_for_current_business(href, business_data['name'])):
                            business_data['website'] = href
                except Exception:
                    continue
                if business_data['website']:
                    break

            # Extract email with caching (only when a website was accepted)
            if business_data['website']:
                business_data['email'] = self.extract_email_from_website_cached(
                    business_data['website'],
                    business_data['name']
                )

            return business_data if business_data['name'] else None

        except Exception as e:
            print(f"Error in extract_detailed_business_info: {e}")
            return None

    def is_valid_business_website_cached(self, url):
        """Cached website validation to avoid repeated checks"""
        if url in self.website_cache:
            return self.website_cache[url]

        result = self.is_valid_business_website(url)
        self.website_cache[url] = result
        return result

    def is_website_for_current_business(self, website_url, business_name):
        """Verify if the website actually belongs to the current business"""
        try:
            if not website_url or not business_name:
                return False

            domain = urlparse(website_url).netloc.lower()
            business_keywords = re.findall(r'\b\w+\b', business_name.lower())

            common_words = {'law', 'office', 'offices', 'group', 'firm', 'pc', 'llc', 'inc', 'the', 'and', 'attorneys', 'lawyers', 'personal', 'injury', 'accident'}
            business_keywords = [word for word in business_keywords if word not in common_words and len(word) > 2]

            keyword_found = False
            for keyword in business_keywords[:3]:
                if keyword in domain or keyword in domain.replace('-', '').replace('_', ''):
                    keyword_found = True
                    break

            return keyword_found

        except:
            return False


    def is_valid_business_website(self, url):
        """Check if URL is a valid business website"""
        if not url:
            return False

        excluded_domains = [
            'google.com', 'gstatic.com', 'googleapis.com', 'googleusercontent.com',
            'facebook.com', 'instagram.com', 'twitter.com', 'linkedin.com',
            'youtube.com', 'yelp.com', 'foursquare.com', 'maps.google.com',
            't.co', 'bit.ly'
        ]

        try:
            parsed = urlparse(url)
            domain = parsed.netloc.lower()

            for excluded in excluded_domains:
                if excluded in domain:
                    return False

            if '.' not in domain or len(domain) < 4:
                return False

            return True

        except:
            return False

    def extract_email_from_website_cached(self, website_url, business_name=''):
        """Cached email extraction to avoid re-processing same websites"""
        cache_key = f"{website_url}_{business_name}"

        if cache_key in self.email_cache:
            return self.email_cache[cache_key]

        email = self.extract_email_from_website(website_url, business_name)
        self.email_cache[cache_key] = email
        return email

    def extract_email_from_website(self, website_url, business_name=''):
        """Fetch a website's likely contact pages and return one email address.

        Pages are requested in order (home page first) and the crawl stops at
        the first page that yields at least one address accepted by
        ``is_valid_email``. When a page yields several addresses, the
        alphabetically first one is returned so repeated runs give the same
        result.

        Returns:
            A lower-cased email address, or ``''`` when none was found or any
            error occurred. Never raises.
        """
        try:
            if not website_url:
                return ''

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Connection': 'keep-alive',
            }

            # Candidate pages, most likely first; '' is the URL exactly as given.
            pages_to_check = [
                '', '/contact', '/contact-us', '/about', '/about-us', '/team', '/attorneys', '/lawyers', '/staff', '/people', '/leadership', '/our-team', '/directory', '/bios', '/management',
                '/executives', '/office', '/locations', '/who-we-are', '/partners', '/members', '/employee-directory', '/key-people', '/bio'
            ]

            # Compiled once for all pages. The TLD class is [A-Za-z]; a '|'
            # inside a character class is a literal pipe, not alternation.
            email_patterns = [
                re.compile(pattern, re.IGNORECASE)
                for pattern in (
                    # Standard email pattern
                    r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b',
                    # Mailto links
                    r'mailto:([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7})',
                    # Email with spaces around @
                    r'\b[A-Za-z0-9._%+-]+\s*@\s*[A-Za-z0-9.-]+\s*\.\s*[A-Za-z]{2,7}\b',
                    # Quoted emails
                    r'["\']([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7})["\']',
                    # Emails in JavaScript or data attributes
                    r'email["\']?\s*[:=]\s*["\']?([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7})',
                    # Contact form action emails
                    r'action=["\'][^"\']*([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7})',
                )
            ]

            found_emails = set()

            for page_path in pages_to_check:
                try:
                    full_url = urljoin(website_url, page_path) if page_path else website_url

                    response = requests.get(
                        full_url,
                        headers=headers,
                        timeout=10,
                        allow_redirects=True,
                        verify=False  # tolerate broken certificates on small-business sites
                    )

                    if response.status_code != 200:
                        continue

                    content = response.text

                    for pattern in email_patterns:
                        for match in pattern.findall(content):
                            # The spaced pattern may capture any whitespace, not only blanks.
                            email = re.sub(r'\s+', '', match)
                            if email and self.is_valid_email(email, business_name):
                                found_emails.add(email.lower())

                    if found_emails:
                        break

                    # Pause before requesting the next page of the same site.
                    time.sleep(1)

                except Exception:
                    # Best effort: a page that fails to load is skipped.
                    continue

            if found_emails:
                return min(found_emails)

            return ''

        except Exception:
            return ''

    def is_valid_email(self, email, business_name=''):
        if not email or '@' not in email:
            return False

        email_lower = email.lower()

        excluded_patterns = [
            'noreply', 'no-reply', 'donotreply', 'do-not-reply',
            'bounce', 'mailer-daemon', 'postmaster', 'webmaster',
            'admin@', 'test@', 'example@', 'sample@',
            '@example.com', '@test.com', '@localhost'
        ]

        for pattern in excluded_patterns:
            if pattern in email_lower:
                return False

        email_regex = r'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}$'

        if not re.match(email_regex, email):
            return False

        if len(email) > 100:
            return False

        return True

    def save_intermediate_csv(self, businesses, search_phrase, city_name):
        if not businesses:
            return None

        # Generate filename for intermediate CSV
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_phrase = re.sub(r'[^\w\s-]', '', search_phrase).strip().replace(' ', '_')
        safe_city = re.sub(r'[^\w\s-]', '', city_name).strip().replace(' ', '_')
        filename = os.path.join(self.save_dir, f"{safe_phrase}_{safe_city}_part_{self.csv_counter}.csv")

        # Save to CSV
        df = pd.DataFrame(businesses)
        column_order = ['name', 'address', 'phone', 'website', 'email']
        df = df.reindex(columns=column_order)
        df.to_csv(filename, index=False)

        print(f"\n{'='*60}")
        print(f"INTERMEDIATE CSV #{self.csv_counter} SAVED")
        print(f"{'='*60}")
        print(f"File: {filename}")
        print(f"Businesses in this file: {len(businesses)}")
        print(f"{'='*60}")

        # Track the file and increment counter
        self.intermediate_files.append(filename)
        self.csv_counter += 1

        return filename

    def scrape_zip_code_area(self, search_phrase, city_name, state_name=None, max_businesses_per_zip=120,
                             output_file=None, batch_size=10):

        start_time = datetime.now()

        print(f"Last batch (0): {START_BATCH}")

        folder_name = f"{search_phrase} in {city_name}".strip()
        self.save_dir = os.path.join(SAVE_ROOT, folder_name)
        os.makedirs(self.save_dir, exist_ok=True)


        # Step 1: Get all ZIP codes for the city
        self.zip_codes = self.get_zip_codes_for_city(city_name, state_name)

        if not self.zip_codes:
            print(f"ERROR: No ZIP codes found for {city_name}")
            return []

        # Step 2: Create Google Maps URLs for each ZIP code
        url_data = self.create_google_maps_urls(search_phrase, self.zip_codes)

        # Step 3: Process ZIP codes with intermediate CSV creation every <batch_size> ZIP codes
        total_zips = len(url_data)
        total_batches = (total_zips + batch_size - 1) // batch_size

        # ----------------------- start at specific batch -----------------------
        start_batch_index = min(max(START_BATCH, 0), max(total_batches - 1, 0))
        start_zip_index = start_batch_index * batch_size
        processed_count = start_zip_index  # continue global ZIP counter correctly
        # keep CSV part numbering aligned with every-<batch_size> ZIP cadence
        self.csv_counter = (processed_count // batch_size) + 1
        print(f"Resuming at batch {start_batch_index + 1}/{total_batches} (ZIP index {start_zip_index + 1})")
        # ---------------------------------------------------------------------------

        zip_group_businesses = []  # Businesses for current group of <batch_size> ZIP codes

        try:
            for i in range(start_zip_index, total_zips, batch_size):
                batch = url_data[i:i + batch_size]
                batch_num = (i // batch_size) + 1
                total_batches = (total_zips + batch_size - 1) // batch_size

                print(f"\n{'='*60}")
                print(f"PROCESSING BATCH {batch_num}/{total_batches}")
                print(f"ZIP codes {i+1}-{min(i+batch_size, total_zips)} of {total_zips}")
                print(f"{'='*60}")

                # Process each ZIP code in the batch
                for j, url_info in enumerate(batch):
                    zip_code = url_info['zip_code']
                    maps_url = url_info['url']
                    query = url_info['query']

                    processed_count += 1

                    print(f"\n[{processed_count}/{total_zips}] Processing ZIP {zip_code}")
                    print(f"Query: {query}")
                    print(f"URL: {maps_url[:80]}...")

                    try:
                        # Extract businesses for this ZIP code
                        businesses = self.extract_business_data_from_maps(
                            maps_url, zip_code, processed_count, total_zips
                        )

                        # Store results
                        self.zip_results[zip_code] = {
                            'zip_code': zip_code,
                            'url': maps_url,
                            'query': query,
                            'businesses_found': len(businesses),
                            'businesses': businesses
                        }

                        # Add to both combined results and current group
                        self.all_businesses.extend(businesses)
                        zip_group_businesses.extend(businesses)

                        print(f"ZIP {zip_code} complete: {len(businesses)} businesses")
                        print(f"Total businesses so far: {len(self.all_businesses)}")

                        # Check if we've processed <batch_size> ZIP codes and create intermediate CSV
                        if processed_count % batch_size == 0 and zip_group_businesses:
                            print(f"\n{'='*50}")
                            print(f"CREATING INTERMEDIATE CSV AFTER {processed_count} ZIP CODES")
                            print(f"{'='*50}")

                            # Clean and deduplicate current group
                            cleaned_group = self.clean_and_deduplicate(zip_group_businesses.copy())

                            # Save intermediate CSV
                            intermediate_file = self.save_intermediate_csv(
                                cleaned_group, search_phrase, city_name
                            )

                            print(f"Intermediate file saved: {intermediate_file}")
                            print(f"Businesses in this group: {len(cleaned_group)}")

                            # Reset group for next <batch_size> ZIP codes
                            zip_group_businesses = []
                            print(f"{'='*50}")

                        if processed_count < total_zips:
                            delay = random.uniform(2, 4)
                            print(f"Waiting {delay:.1f}s before next ZIP code...")
                            time.sleep(delay)

                    except Exception as e:
                        print(f"ERROR processing ZIP {zip_code}: {str(e)[:100]}...")
                        self.zip_results[zip_code] = {
                            'zip_code': zip_code,
                            'url': maps_url,
                            'query': query,
                            'businesses_found': 0,
                            'businesses': [],
                            'error': str(e)
                        }
                        continue

                # Shorter break between batches
                if batch_num < total_batches:
                    print(f"\nBatch {batch_num} complete. Taking 8-second break...")
                    time.sleep(8)

            # Save any remaining businesses in the last group (if not a multiple of batch_size)
            if zip_group_businesses:
                print(f"\n{'='*50}")
                print(f"CREATING FINAL INTERMEDIATE CSV FOR REMAINING ZIP CODES")
                print(f"{'='*50}")

                cleaned_group = self.clean_and_deduplicate(zip_group_businesses.copy())
                intermediate_file = self.save_intermediate_csv(
                    cleaned_group, search_phrase, city_name
                )
                print(f"Final intermediate file saved: {intermediate_file}")
                print(f"{'='*50}")

        except KeyboardInterrupt:
            print("\n\nScraping interrupted by user. Saving partial results...")
            # Save any remaining businesses if interrupted
            if zip_group_businesses:
                cleaned_group = self.clean_and_deduplicate(zip_group_businesses.copy())
                self.save_intermediate_csv(cleaned_group, search_phrase, city_name)

        except Exception as e:
            print(f"\nUnexpected error: {e}")

        finally:
            # Process and save final combined results
            if self.all_businesses:
                print(f"\n{'='*80}")
                print("PROCESSING FINAL COMBINED RESULTS")

                # Clean and deduplicate all businesses
                cleaned_businesses = self.clean_and_deduplicate(self.all_businesses)

                # Generate output filename if not provided
                if not output_file:
                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                    safe_phrase = re.sub(r'[^\w\s-]', '', search_phrase).strip().replace(' ', '_')
                    safe_city = re.sub(r'[^\w\s-]', '', city_name).strip().replace(' ', '_')
                    output_file = f"{safe_phrase}_{safe_city}_FINAL_{timestamp}.csv"

                # Save final combined CSV
                df = self.save_results_to_csv(cleaned_businesses, output_file)

                # Print comprehensive summary including intermediate files
                self.print_comprehensive_summary_with_intermediates(
                    cleaned_businesses, start_time, search_phrase, city_name
                )

                return cleaned_businesses

            else:
                print("No businesses found across all ZIP codes")
                return []

            # Clean up
            if hasattr(self, 'driver'):
                self.driver.quit()

    def clean_and_deduplicate(self, businesses):
        """Normalize business records and drop duplicates.

        Every record is reduced to the seven output fields (name, address,
        phone, website, email, zip_code, source_url) with each value converted
        to a whitespace-stripped string. A record is dropped when its name is
        shorter than 3 characters, or when an earlier record has the same
        case-insensitive (name, address) pair.

        Args:
            businesses: List of dict records. Missing keys and ``None``
                values are treated as empty strings.

        Returns:
            A new list of cleaned dicts in first-seen order; the input
            records are not modified.
        """
        print("Cleaning and deduplicating businesses...")

        def as_text(record, key):
            # Scraped values may be None or non-string (e.g. an int ZIP code);
            # calling .strip() on them directly would raise.
            value = record.get(key)
            return '' if value is None else str(value).strip()

        output_fields = ('name', 'address', 'phone', 'website', 'email', 'zip_code', 'source_url')
        cleaned_businesses = []
        seen_combinations = set()

        for business in businesses:
            cleaned_business = {field: as_text(business, field) for field in output_fields}

            name = cleaned_business['name']
            if len(name) < 3:
                # Too short to be a plausible business name.
                continue

            # A tuple key cannot collide the way an underscore-joined string
            # can ("a_b" + "c" versus "a" + "b_c").
            unique_key = (name.lower(), cleaned_business['address'].lower())
            if unique_key in seen_combinations:
                continue

            seen_combinations.add(unique_key)
            cleaned_businesses.append(cleaned_business)

        # NOTE: the "removed" figure also counts records skipped for a short name.
        print(f"Cleaned: {len(businesses)} → {len(cleaned_businesses)} (removed {len(businesses) - len(cleaned_businesses)} duplicates)")
        return cleaned_businesses

    def save_results_to_csv(self, businesses, filename):
        """Save results to CSV with proper formatting"""
        if not businesses:
            print("No business data to save")
            return None

        df = pd.DataFrame(businesses)

        column_order = ['name', 'address', 'phone', 'website', 'email']
        df = df.reindex(columns=column_order)

        filename = os.path.join(self.save_dir, os.path.basename(filename))
        df.to_csv(filename, index=False)
        print(f"Saved {len(businesses)} businesses to {filename}")

        # Show sample data
        print("\nSample data:")
        sample_df = df.head(3)[['name', 'address', 'phone', 'website', 'email']]
        for col in sample_df.columns:
            sample_df[col] = sample_df[col].astype(str).str[:40] + '...'
        print(sample_df.to_string(index=False))

        return df

    def print_comprehensive_summary_with_intermediates(self, businesses, start_time, search_phrase, city_name):
        end_time = datetime.now()
        duration = end_time - start_time

        print(f"\n{'='*80}")
        print("FINAL COMPREHENSIVE SCRAPING SUMMARY")
        print(f"{'='*80}")

        total_businesses = len(businesses)
        with_websites = len([b for b in businesses if b['website']])
        with_emails = len([b for b in businesses if b['email']])
        with_phones = len([b for b in businesses if b['phone']])
        with_addresses = len([b for b in businesses if b['address']])

        print(f"Search phrase: {search_phrase}")
        print(f"City: {city_name}")
        print(f"Total processing time: {duration}")
        print(f"ZIP codes processed: {len(self.zip_results)}")
        print(f"Total unique businesses: {total_businesses}")
        print(f"Businesses with websites: {with_websites} ({with_websites/max(1,total_businesses)*100:.1f}%)")
        print(f"Businesses with emails: {with_emails} ({with_emails/max(1,total_businesses)*100:.1f}%)")
        print(f"Businesses with phones: {with_phones} ({with_phones/max(1,total_businesses)*100:.1f}%)")
        print(f"Businesses with addresses: {with_addresses} ({with_addresses/max(1,total_businesses)*100:.1f}%)")

        # Intermediate files summary
        print(f"\n{'='*60}")
        print("INTERMEDIATE CSV FILES CREATED")
        print(f"{'='*60}")
        print(f"Total intermediate files: {len(self.intermediate_files)}")
        for i, filename in enumerate(self.intermediate_files, 1):
            print(f"  {i}. {filename}")
            if os.path.exists(filename):
                try:
                    df_temp = pd.read_csv(filename)
                    print(f"     → {len(df_temp)} businesses")
                except:
                    print(f"     → File exists")
        print(f"{'='*60}")

        # Performance metrics
        if len(self.zip_results) > 0:
            avg_time_per_zip = duration.total_seconds() / len(self.zip_results)
            avg_businesses_per_zip = total_businesses / len(self.zip_results)
            print(f"Average time per ZIP code: {avg_time_per_zip:.1f} seconds")
            print(f"Average businesses per ZIP: {avg_businesses_per_zip:.1f}")

        if with_websites > 0:
            email_success_rate = with_emails / with_websites * 100
            print(f"Email extraction success rate: {email_success_rate:.1f}%")

        # Cache efficiency metrics
        print(f"Email cache size: {len(self.email_cache)} entries")
        print(f"Website validation cache size: {len(self.website_cache)} entries")

        # ZIP code breakdown
        print(f"\nTOP 10 ZIP CODES BY BUSINESS COUNT:")
        zip_counts = {}
        for business in businesses:
            zip_code = business.get('zip_code', 'Unknown')
            zip_counts[zip_code] = zip_counts.get(zip_code, 0) + 1

        sorted_zips = sorted(zip_counts.items(), key=lambda x: x[1], reverse=True)
        for zip_code, count in sorted_zips[:10]:
            print(f"  {zip_code}: {count} businesses")

        print(f"\nQUALITY METRICS:")
        if total_businesses > 0:
            complete_profiles = len([b for b in businesses if all([b['name'], b['address'], b['phone'], b['website'], b['email']])])
            print(f"Complete profiles (all fields): {complete_profiles} ({complete_profiles/total_businesses*100:.1f}%)")

            useful_profiles = len([b for b in businesses if b['name'] and (b['phone'] or b['website'] or b['email'])])
            print(f"Useful profiles (name + contact): {useful_profiles} ({useful_profiles/total_businesses*100:.1f}%)")


        print(f"Intermediate CSV files: {len(self.intermediate_files)}")
        for i, filename in enumerate(self.intermediate_files, 1):
            print(f"  Part {i}: {filename}")
        print(f"Final combined CSV: Contains all {total_businesses} unique businesses")
        print(f"{'='*80}")


def scrape_by_zip_codes(search_phrase, city_name, state_name=None, max_businesses_per_zip=120,
                                  output_file=None, batch_size=5):
    """Run a ZIP-code-based scrape using a freshly created scraper.

    Convenience wrapper: instantiates ``ZipCodeBusinessScraper`` and forwards
    every argument, unchanged and by keyword, to its
    ``scrape_zip_code_area`` method.

    Returns:
        Whatever ``scrape_zip_code_area`` returns.
    """
    forwarded_options = {
        "search_phrase": search_phrase,
        "city_name": city_name,
        "state_name": state_name,
        "max_businesses_per_zip": max_businesses_per_zip,
        "output_file": output_file,
        "batch_size": batch_size,
    }
    zip_scraper = ZipCodeBusinessScraper()
    return zip_scraper.scrape_zip_code_area(**forwarded_options)

# Main execution
if __name__ == "__main__":
    # Interactive entry point: prompt for the search parameters, run the
    # ZIP-code scrape, and report the outcome.
    search_phrase = input("Enter search phrase: ").strip().lower()
    city_name = input("Enter city name: ").strip().lower()
    state_name = input("Enter state name: ").strip().lower()

    batch_size = 5  # default is 5
    max_businesses_per_zip = 120

    # Only the basename of this name is used by the scraper, which writes the
    # final CSV inside its own save directory.
    output_file = f"{search_phrase}_{city_name}.csv".replace(" ", "_")

    total_businesses = scrape_by_zip_codes(
        search_phrase=search_phrase,
        city_name=city_name,
        state_name=state_name,
        max_businesses_per_zip=max_businesses_per_zip,
        batch_size=batch_size,
        output_file=output_file
    )

    # The final CSV is only written when at least one business was found, so
    # do not claim a file was saved for an empty result.
    if total_businesses:
        print(f"\nData saved to: {output_file}")
    else:
        print("\nNo businesses found; no final CSV was written.")

    print("\nScraping Complete!")
    print(f"Found {len(total_businesses)} businesses!")
