In [14]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
import time

# Initialize Selenium WebDriver (Chrome)
chrome_options = Options()
chrome_options.add_argument("--headless")  # Runs Chrome in headless mode (no UI)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# Go to the webpage with Selenium
url = "https://www.meganslaw.psp.pa.gov/Search/MileRadiusSearch"
driver.get(url)

wait = WebDriverWait(driver, 10) 

# Automate filling in the search form
try:
    # Find and click the "Accept" button (targeting the form and the button class)
    accept_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[text()="Accept"]')))
    accept_button.click()

    print("Clicked the Accept button. Now proceeding to the search page...")
    address_field = WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.ID, 'enteredAddr1')))
    driver.execute_script("arguments[0].scrollIntoView(true);", address_field) 
    driver.execute_script("arguments[0].value = 'Hamburgh Hall';", address_field)

    #address_field= wait.until(EC.visibility_of_element_located((By.ID, 'enteredAddr1')))
    #address_field.send_keys("Hamburgh Hall")

    # Select the city
    city_field = wait.until(EC.visibility_of_element_located((By.ID, 'selectedCity')))
    city_field.send_keys("PITTSBURGH")

    # Input ZIP code
    zip_field = wait.until(EC.visibility_of_element_located((By.ID, 'enteredZip')))
    zip_field.send_keys("15213")

    # Select radius (in miles)
    mile_radius_dropdown = Select(driver.find_element(By.ID, 'MileRadiusDDL'))
    mile_radius_dropdown.select_by_visible_text("3 Miles")

    # Submit the search form (adjust the submit button identifier as needed)
    submit_button = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.NAME, 'MileRadiusSearchResults')))
    driver.execute_script("arguments[0].scrollIntoView(true);", submit_button)
    driver.execute_script("arguments[0].click();", submit_button)

    # Wait for the results page to load
    time.sleep(5)

    # Now proceed to scrape the result page
    offenders = driver.find_elements(By.CLASS_NAME, 'row.searchResultRow')

    # Initialize a list to store the extracted data
    offender_data = []

    has_next_page = True
    
    while True:
        time.sleep(5)

        # Scrape the result page
        offenders = driver.find_elements(By.CLASS_NAME, 'row.searchResultRow')

        # Loop through each offender and extract details
        for offender in offenders:
            name = offender.find_element(By.CLASS_NAME, 'searchResultName').text
            tier = offender.find_elements(By.CLASS_NAME, 'gridDataItem.br-responsive-sm')[0].text
            birth_year = offender.find_elements(By.CLASS_NAME, 'gridDataItem.br-responsive-sm')[1].text
            address_block = offender.find_element(By.CLASS_NAME, 'searchResultAddress').text.split("\n")

            # Append the extracted data
            offender_data.append({
                'Name': name,
                'Tier': tier,
                'Birth Year': birth_year,
                'Address': address_block[0],  # Street Address
                'City/State/ZIP': address_block[1]  # City/State/ZIP
            })

        try:
            next_button_li = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'PagedList-skipToNext'))
            )

            # Check if the "Next" button is disabled by looking at its class or missing href
            if "disabled" in next_button_li.get_attribute("class"):
                print("No more pages, 'Next' button is disabled.")
                break  # No more pages, stop the loop

            # Find the anchor tag inside the next button and click it
            next_link = next_button_li.find_element(By.TAG_NAME, 'a')
            next_href = next_link.get_attribute('href')  # Get the URL for the next page

            print(f"Navigating to next page: {next_href}")
            driver.execute_script("arguments[0].click();", next_link)  # Trigger the click via JavaScript

        except Exception as e:
            print("No more pages or error encountered.")
            break  # No more next button, stop the loop
    # Close the browser when done
    driver.quit()

    # Print the extracted data
    if offender_data:
        for data in offender_data:
            print(data)
    else:
        print("No data found.")
    
except Exception as e:
    print(f"Error during scraping: {e}")
    driver.quit()


Clicked the Accept button. Now proceeding to the search page...
Navigating to next page: https://www.meganslaw.psp.pa.gov/Search/MileRadiusSearchResultsAsync?page=2&enteredAddr1=Hamburgh%20Hall&selectedCity=PITTSBURGH&enteredZip=15213&selectedMileRadius=3&selectedSortBy=1&chkMileRadiusIncarcerated=False
Navigating to next page: https://www.meganslaw.psp.pa.gov/Search/MileRadiusSearchResultsAsync?page=3&enteredAddr1=Hamburgh%20Hall&selectedCity=PITTSBURGH&enteredZip=15213&selectedMileRadius=3&selectedSortBy=1&chkMileRadiusIncarcerated=False
Navigating to next page: https://www.meganslaw.psp.pa.gov/Search/MileRadiusSearchResultsAsync?page=4&enteredAddr1=Hamburgh%20Hall&selectedCity=PITTSBURGH&enteredZip=15213&selectedMileRadius=3&selectedSortBy=1&chkMileRadiusIncarcerated=False
Navigating to next page: https://www.meganslaw.psp.pa.gov/Search/MileRadiusSearchResultsAsync?page=5&enteredAddr1=Hamburgh%20Hall&selectedCity=PITTSBURGH&enteredZip=15213&selectedMileRadius=3&selectedSortBy=1&chkM

In [16]:
offender_data

[{'Name': 'ABDI, MOHAMED HUSSEIN',
  'Tier': 'Tier 2',
  'Birth Year': '1971',
  'Address': '2923 PENN AVE',
  'City/State/ZIP': 'PITTSBURGH, PA 15201'},
 {'Name': 'ABERNATHY, POSHAUNTAMAR',
  'Tier': 'Tier 2',
  'Birth Year': '1979',
  'Address': '757 HAZELWOOD AVE',
  'City/State/ZIP': 'APT #2'},
 {'Name': 'ABERNATHY, RASUL ALI',
  'Tier': 'Tier 2',
  'Birth Year': '1981',
  'Address': '757 HAZELWOOD AVENUE',
  'City/State/ZIP': 'PITTSBURGH, PA 15217'},
 {'Name': 'ALLEN, JEFFREY',
  'Tier': 'Lifetime',
  'Birth Year': '1962',
  'Address': '414 SAINT JOSEPH STREET',
  'City/State/ZIP': 'PITTSBURGH, PA 15210'},
 {'Name': 'AMARO, MIGUEL VEGA',
  'Tier': 'Out-of-State',
  'Birth Year': '1981',
  'Address': 'TRANSIENT-HOMELESS',
  'City/State/ZIP': 'AREA OF PITTSBURGH CITY'},
 {'Name': 'ANDREEN, KEITH BRIAN',
  'Tier': 'Out-of-State',
  'Birth Year': '1991',
  'Address': '3229 JOSEPHINE STREET',
  'City/State/ZIP': 'PITTSBURGH, PA 15203'},
 {'Name': 'ARASIN, ROBERT ALEX',
  'Tier': 'Tier 

In [18]:
import pandas as pd

# Function to clean the data
def clean_data(data):
    cleaned_data = []

    # Iterate over each offender's data (a dictionary)
    for offender in data:
        cleaned_offender = {}

        # Clean the 'Name' field
        cleaned_offender['Name'] = offender.get('Name', '').strip().title()

        # Clean the 'Tier' field (ensure it is consistent in case of typos or other formatting issues)
        cleaned_offender['Tier'] = offender.get('Tier', '').strip().capitalize()

        # Clean the 'Birth Year' (ensure it is numeric and not malformed)
        birth_year = offender.get('Birth Year', '').strip()
        if birth_year.isdigit():
            cleaned_offender['Birth Year'] = int(birth_year)  # Convert to integer
        else:
            cleaned_offender['Birth Year'] = None  # Set to None if invalid

        # Clean the 'Address' field (remove any extra spaces or unwanted characters)
        cleaned_offender['Address'] = offender.get('Address', '').strip().title()

        # Clean the 'City/State/ZIP' field (address cleanup)
        city_state_zip = offender.get('City/State/ZIP', '').strip()

        # Check if the format has comma separation, else handle other formats
        if ',' in city_state_zip:
            city_state_zip_parts = city_state_zip.split(',')
            cleaned_offender['City'] = city_state_zip_parts[0].strip().title()

            # Sometimes state and ZIP are combined in various formats (like "PA 15213")
            state_zip = city_state_zip_parts[1].strip().split(' ')
            if len(state_zip) == 2:
                cleaned_offender['State'] = state_zip[0].strip().upper()
                cleaned_offender['ZIP'] = state_zip[1].strip()
            else:
                cleaned_offender['State'] = state_zip[0].strip().upper() if state_zip else None
                cleaned_offender['ZIP'] = None
        else:
            # If no comma, we assume this field is not well-formed. Assign it as is.
            cleaned_offender['City/State/ZIP'] = city_state_zip

        # Append the cleaned offender's data to the list
        cleaned_data.append(cleaned_offender)

    return cleaned_data

# Use the real scraped offender data from your Selenium script
cleaned_offender_data = clean_data(offender_data)

# Convert to a pandas DataFrame for easier CSV export
df = pd.DataFrame(cleaned_offender_data)

# Export the cleaned data to a CSV file
df.to_csv('cleaned_offender_data.csv', index=False)

print("Data cleaned and exported to cleaned_offender_data.csv")


Data cleaned and exported to cleaned_offender_data.csv


In [22]:
import pandas as pd
cleaned_offender_data = pd.read_csv('cleaned_offender_data.csv')
cleaned_offender_data.head()

Unnamed: 0,Name,Tier,Birth Year,Address,City,State,ZIP,City/State/ZIP
0,"Abdi, Mohamed Hussein",Tier 2,1971,2923 Penn Ave,Pittsburgh,PA,15201.0,
1,"Abernathy, Poshauntamar",Tier 2,1979,757 Hazelwood Ave,,,,APT #2
2,"Abernathy, Rasul Ali",Tier 2,1981,757 Hazelwood Avenue,Pittsburgh,PA,15217.0,
3,"Allen, Jeffrey",Lifetime,1962,414 Saint Joseph Street,Pittsburgh,PA,15210.0,
4,"Amaro, Miguel Vega",Out-of-state,1981,Transient-Homeless,,,,AREA OF PITTSBURGH CITY
