# Web Scraping

## Imports

In [None]:
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep

## Initialize Webdriver

In [None]:
# Start a Chrome session at module level. The 'detach' experimental option is
# presumably meant to keep the browser window open after the script ends —
# TODO confirm against the ChromeDriver capabilities documentation.
options = webdriver.ChromeOptions()
options.add_experimental_option('detach', True)
# NOTE(review): `driver` is assigned again in the Setup section through
# initialize_driver(); the session created here does not appear to be used or
# quit before that happens — confirm whether this cell is still needed.
driver = webdriver.Chrome(options=options)

## Define Functions

### Input Validation

In [None]:
def validate_input_file(file_path):
    """Load the input Excel workbook, failing early with a clear error.

    Args:
        file_path: Path to the Excel file to read.

    Returns:
        The DataFrame produced by ``pandas.read_excel(file_path)``.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
        ValueError: If pandas cannot read the file. The underlying error is
            attached as ``__cause__`` so the original traceback is kept.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file '{file_path}' does not exist.")
    try:
        return pd.read_excel(file_path)
    except Exception as e:
        # Chain explicitly so the root cause (bad format, missing engine,
        # permissions, ...) stays visible to whoever debugs the failure.
        raise ValueError(f"Error reading the Excel file: {e}") from e

def validate_url(url):
    """Check that ``url`` is an HTTP(S) URL that names a host.

    Args:
        url: The URL string to check.

    Returns:
        None. The function only signals problems by raising.

    Raises:
        ValueError: If ``url`` is not a string, does not start with
            ``http://`` or ``https://``, or has no host part
            (for example ``"https://"``).
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urlparse

    message = "Invalid URL. Please provide a valid HTTP or HTTPS URL."
    if not isinstance(url, str) or not url.startswith(("http://", "https://")):
        raise ValueError(message)
    try:
        host = urlparse(url).netloc
    except ValueError:
        # urlparse rejects some malformed authorities (e.g. a bad IPv6 literal).
        host = ""
    if not host:
        raise ValueError(message)

def validate_output_file(output_path):
    """Check that ``output_path`` can be opened for writing.

    The probe opens the file in append mode so that an existing file keeps
    its contents. If the probe had to create the file, the empty file is
    removed again so a failed run leaves no stray output behind.

    Args:
        output_path: Path of the file that results will later be written to.

    Returns:
        None.

    Raises:
        IOError: If the file cannot be opened for writing. The underlying
            error is attached as ``__cause__``.
    """
    already_existed = os.path.exists(output_path)
    try:
        # 'a' tests writability without truncating, unlike 'w'.
        with open(output_path, 'a'):
            pass
    except Exception as e:
        raise IOError(f"Cannot write to the output file '{output_path}': {e}") from e

    if not already_existed:
        try:
            os.remove(output_path)
        except OSError:
            # Best effort: a leftover empty probe file is harmless because
            # the real output overwrites it later.
            pass

### Selenium Driver Initialization and Scraping

In [None]:
def initialize_driver():
    """Create a Chrome WebDriver configured with the 'detach' option.

    Returns:
        The ``webdriver.Chrome`` instance that was started.

    Raises:
        RuntimeError: If Chrome or its driver cannot be started. The
            underlying error is attached as ``__cause__``.
    """
    try:
        options = webdriver.ChromeOptions()
        options.add_experimental_option('detach', True)
        return webdriver.Chrome(options=options)
    except Exception as e:
        # Chain explicitly so the driver's own error and traceback survive.
        raise RuntimeError(f"Error initializing Selenium WebDriver: {e}") from e
def perform_scraping(driver, url):
    """Navigate ``driver`` to ``url``.

    Args:
        driver: Object exposing a ``get(url)`` method (a Selenium WebDriver).
        url: Address to load.

    Returns:
        None.

    Raises:
        RuntimeError: If ``driver.get`` fails for any reason. The underlying
            error is attached as ``__cause__``.
    """
    try:
        driver.get(url)
    except Exception as e:
        raise RuntimeError(f"Error accessing the URL '{url}': {e}") from e

### Helper Functions

In [None]:
def clean_name(name):
    """Normalise a corporation name: lowercase it and drop commas and periods."""
    # One translation pass deletes both punctuation characters.
    punctuation_removed = str.maketrans('', '', ',.')
    lowered = name.lower()
    return lowered.translate(punctuation_removed)

def fetch_agent_details(driver, name, cleaned_results, search_results, rows):
    """Open the search result matching ``name`` and read its agent details.

    Args:
        driver: Selenium WebDriver currently showing the search results.
        name: Cleaned corporation name to look for.
        cleaned_results: Cleaned result names, index-aligned with
            ``search_results`` and ``rows``.
        search_results: Clickable result link elements.
        rows: Result table row elements; the last whitespace-separated token
            of the matching row's text is taken as the status.

    Returns:
        A tuple ``(agent_name, address, status)``. When ``name`` is not among
        ``cleaned_results`` the tuple is
        ``("Agent Not Found", "Address Not Found", "INACT")``. When the match
        exists but the agent section cannot be read, the agent fields carry
        the "Not Found" placeholders while ``status`` keeps the value already
        read from the results row.
    """
    # Single lookup instead of an `in` test followed by two `.index()` scans.
    try:
        match_index = cleaned_results.index(name)
    except ValueError:
        return "Agent Not Found", "Address Not Found", "INACT"

    status = rows[match_index].text.split()[-1]
    element = search_results[match_index]
    # Bring the link into view before clicking it.
    driver.execute_script("arguments[0].scrollIntoView({ behavior: 'smooth', block: 'center' });", element)
    element.click()
    try:
        agent_section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.detailSection:nth-of-type(5)"))
        )
        agent_text = agent_section.text.split('\n')
    except Exception:
        # The status came from the results row, so it is still valid even
        # though the detail page could not be read.
        return "Agent Not Found", "Address Not Found", status

    # Line 0 is skipped (presumably the section heading — verify against the
    # page); line 1 is used as the agent name, lines 2-3 as the address.
    agent_name = agent_text[1] if len(agent_text) > 1 else "Agent Not Found"
    address = " ".join(agent_text[2:4]) if len(agent_text) > 3 else "Address Not Found"
    return agent_name, address, status

def handle_name_variations(name):
    """Expand common abbreviations in a cleaned corporation name.

    The whole words ``ents`` and ``corp`` are replaced by ``enterprise`` and
    ``corporation``. Matching is on word boundaries, so words that merely
    contain those letters ("students", "corporation", "incorporated") are
    left untouched.

    Args:
        name: Corporation name, expected to be lowercased already.

    Returns:
        The name with the abbreviations expanded.
    """
    # Local import keeps this block self-contained.
    import re

    name = re.sub(r'\bents\b', 'enterprise', name)
    name = re.sub(r'\bcorp\b', 'corporation', name)
    return name

## Setup

In [None]:
# Collect the run parameters interactively. Surrounding whitespace (and, for
# the file paths, surrounding quotes such as those added by "Copy as path")
# is stripped so that pasted values do not fail validation.
input_file = input("Enter the path to the Excel file: ").strip().strip('"\'')  # Example: 'realtor data.xlsx'
output_file = input("Enter the name for the output file: ").strip().strip('"\'')  # Example: 'Sunbiz Data.xlsx'
search_url = input("Enter the search URL: ").strip()  # Example: 'https://search.sunbiz.org/...'

In [None]:
# Validate every user-supplied value before any scraping starts. On success
# `data` holds the DataFrame read from the input workbook.
try:
    data = validate_input_file(input_file)
    validate_url(search_url)
    validate_output_file(output_file)
    print("Input and output files validated successfully.")
except Exception as e:
    print(f"Validation Error: {e}")
    # `exit()` is an interactive helper added by the `site` module and would
    # report success (status 0); raise SystemExit with a failure status.
    raise SystemExit(1)

In [None]:
# If an earlier cell already bound `driver`, close that browser first so that
# rebinding the name below does not leave an orphaned Chrome session behind.
_stale_driver = globals().get("driver")
if _stale_driver is not None:
    try:
        _stale_driver.quit()
    except Exception:
        # Best effort: the old session may already be closed.
        pass

try:
    driver = initialize_driver()
except Exception as e:
    print(f"Error initializing WebDriver: {e}")
    # `exit()` is an interactive helper and would report success (status 0).
    raise SystemExit(1)

## Main Logic

In [None]:
# Imported here so this cell brings the one extra name it needs into scope.
from selenium.common.exceptions import TimeoutException

# One entry per row of data['Corporation Name'], in the same order, so the
# three lists can be combined with that column when saving.
agent_names = []
agent_addresses = []
active_status = []

try:
    perform_scraping(driver, search_url)

    for name in data['Corporation Name']:
        if pd.isna(name):
            # A blank cell cannot be searched; record placeholders so the
            # result lists stay aligned with the input rows.
            agent_names.append("Agent Not Found")
            agent_addresses.append("Address Not Found")
            active_status.append("INACT")
            continue

        # str() guards against cells that pandas parsed as numbers.
        cleaned_name = clean_name(str(name))
        processed_name = handle_name_variations(cleaned_name)

        search_box = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="SearchTerm"]'))
        )
        search_box.click()
        search_box.clear()
        search_box.send_keys(processed_name)
        search_box.send_keys(Keys.ENTER)

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'td.large-width a'))
            )
        except TimeoutException:
            # No result links appeared for this name: treat it as "not found"
            # and continue, rather than aborting the whole run.
            agent_names.append("Agent Not Found")
            agent_addresses.append("Address Not Found")
            active_status.append("INACT")
            continue

        search_results = driver.find_elements(By.CSS_SELECTOR, 'td.large-width a')
        rows = driver.find_elements(By.XPATH, "//table/tbody/tr")
        cleaned_results = [clean_name(el.text) for el in search_results]

        agent_name, agent_address, status = fetch_agent_details(
            driver, processed_name, cleaned_results, search_results, rows
        )
        agent_names.append(agent_name)
        agent_addresses.append(agent_address)
        active_status.append(status)

except Exception as e:
    # Anything else (navigation failure, missing search box, dead browser)
    # is fatal: close the browser and stop with a failure status.
    print(f"Error during scraping: {e}")
    driver.quit()
    # `exit()` is an interactive helper and would report success (status 0).
    raise SystemExit(1)

## Save Results

In [None]:
# Combine the input names with the scraped columns and write the workbook.
# The browser is closed afterwards whether or not the save succeeded.
try:
    results = pd.DataFrame(
        {
            "Corp Name": data["Corporation Name"],
            "Agent Name": agent_names,
            "Address": agent_addresses,
            "Status": active_status,
        }
    )
    results.to_excel(output_file, index=False)
except Exception as e:
    print(f"Error saving results: {e}")
else:
    print("Results saved successfully!")
finally:
    driver.quit()
    print("WebDriver closed.")