In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from tqdm import tqdm

# Landing page for Pittsburgh rental listings; the page loop appends "/pg-<n>" to it
base_url = "https://www.realtor.com/apartments/Pittsburgh_PA"

# Request headers carrying a desktop-browser User-Agent string
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/89.0.4389.82 Safari/537.36'
    ),
}

# Accumulator for scraped rows (one dict per listing)
rental_data = list()

# How many result pages to request; tune to the number of pages actually available
num_pages = 30

def extract_zip_code(address):
    """Extract the 5-digit ZIP code from an address string.

    The ZIP code is the trailing element of a US address, so the LAST
    standalone 5-digit number is returned. Taking the first match would
    mistake a 5-digit street number (e.g. "12345 Main St, ... PA 15213")
    for the ZIP code.

    Args:
        address: Address text, e.g. "123 Main St, Pittsburgh, PA 15213".
            May be empty or None.

    Returns:
        The ZIP code as a 5-character string, or None when the address is
        empty/None or contains no standalone 5-digit number.
    """
    if not address:
        return None
    # \b keeps the match to standalone 5-digit runs; for a ZIP+4 value such
    # as "15213-1234" only the leading "15213" qualifies.
    candidates = re.findall(r'\b\d{5}\b', address)
    return candidates[-1] if candidates else None

def average_price(price_text):
    """Convert a listing price string to a single float.

    Every numeric amount found in the text is averaged, so a range such as
    "$1,200 - $1,500" yields 1350.0 and a single price such as "$1,200"
    yields 1200.0. Amounts are located with a regex rather than by splitting
    on the literal " - ", so ranges written without spaces or with an en
    dash ("$1,200-$1,500", "$1,200–$1,500") and decorated prices
    ("$950/mo", "$1,200+") are handled as well.

    Args:
        price_text: Price text as scraped from the page.

    Returns:
        The price, or the mean of all prices in the text, as a float.

    Raises:
        ValueError: If the text contains no numeric amount
            (e.g. "Contact for price").
    """
    # A number is a digit followed by digits/thousands separators, with an
    # optional decimal part; currency symbols and suffixes are ignored.
    amounts = [
        float(token.replace(',', ''))
        for token in re.findall(r'\d[\d,]*(?:\.\d+)?', price_text)
    ]
    if not amounts:
        raise ValueError(f"No numeric price found in {price_text!r}")
    return sum(amounts) / len(amounts)

# Scrape each results page, collect (address, price, zipcode) rows into
# rental_data, then write everything to a CSV file.

retries = 3            # total attempts per page before giving up on it
request_timeout = 10   # seconds; without a timeout requests.get() can block indefinitely
retry_delay = 2        # seconds to wait before re-trying a failed page
page_delay = 1         # seconds to pause between pages (respectful scraping)

# Use tqdm to create a progress bar
for page in tqdm(range(1, num_pages + 1), desc="Fetching Pages"):
    # Construct the URL for the current page
    url = f"{base_url}/pg-{page}"

    for attempt in range(1, retries + 1):
        is_last_attempt = attempt == retries

        try:
            # Only the network call is guarded: parsing errors should surface.
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.exceptions.RequestException as e:
            if is_last_attempt:
                print(f"Connection error: {e}. Giving up on page {page}.")
            else:
                print(f"Connection error: {e}. Retrying...")
                time.sleep(retry_delay)
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve data from page {page}: {response.status_code}")
            if not is_last_attempt:
                time.sleep(retry_delay)
            continue

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all the listings on the page.
        # NOTE(review): the price span class looks auto-generated and may
        # change when the site is redeployed — confirm the selectors still match.
        address_divs = soup.find_all('div', class_='truncate-line', attrs={'data-testid': 'card-address-2'})
        price_spans = soup.find_all('span', class_='base__StyledType-rui__sc-108xfm0-0 jywYrs')

        # Nothing to record for this page; retrying the same URL is not
        # attempted, the loop simply moves on to the next page.
        if not address_divs or not price_spans:
            print(f"No addresses or prices found on page {page}.")
            break

        # zip() below pairs elements by position and silently drops the
        # surplus, so a count mismatch means rows may be misaligned.
        if len(address_divs) != len(price_spans):
            print(
                f"Warning: page {page} has {len(address_divs)} addresses but "
                f"{len(price_spans)} prices; pairs may be misaligned."
            )

        for address, price in zip(address_divs, price_spans):
            address_text = address.get_text(strip=True)
            price_text = price.get_text(strip=True)
            try:
                price_value = average_price(price_text)
            except ValueError:
                # A non-numeric price must not abort the whole run and
                # discard rows already collected; record it as missing.
                price_value = None
            rental_data.append({
                'Address': address_text,
                'Price': price_value,
                'zipcode': extract_zip_code(address_text)
            })

        break  # Page processed successfully; leave the retry loop

    # Respectful scraping: pause between pages (not needed after the last one)
    if page < num_pages:
        time.sleep(page_delay)

# Convert the list to a DataFrame; naming the columns keeps the CSV header
# intact even when no rows were scraped.
rental_df = pd.DataFrame(rental_data, columns=['Address', 'Price', 'zipcode'])

if rental_df.empty:
    print("Warning: no listings were scraped; the CSV will contain only the header row.")

# Save the DataFrame to a CSV file
rental_df.to_csv('pittsburgh_rental_prices.csv', index=False)

print("Data saved to pittsburgh_rental_prices.csv")


Fetching Pages:   0%|          | 0/30 [00:03<?, ?it/s]

KeyboardInterrupt

