In [1]:
import requests
from bs4 import BeautifulSoup

def fetch_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.google.com/'
    }

    with requests.Session() as session:
        session.headers.update(headers)
        try:
            response = session.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            return soup
        except requests.HTTPError as http_err:
            print(f'HTTP error occurred: {http_err}')
        except Exception as err:
            print(f'Other error occurred: {err}')

def extract_prices(soup):
    prices = []
    for price in soup.find_all('span', 'PropertyCardWrapper__StyledPriceLine-srp__sc-16e8gqd-1 iMKTKr'):
        prices.append(price.text)
    return prices

def extract_locations(soup):
    locations = []
    for location in soup.find_all('a', 'StyledPropertyCardDataArea-c11n-8-84-3__sc-yipmu-0 jnnxAW property-card-link'):
        locations.append(location.text)
    return locations

def extract_size_sqft(soup):
    lot_sizes = []
    for lot_size in soup.find_all('ul', 'StyledPropertyCardHomeDetailsList-c11n-8-84-3__sc-1xvdaej-0 eYPFID'):
        parts = lot_size.text.split(' ')
        if len(parts) >= 3:
            # Adding commas between the elements
            formatted_text = ', '.join(parts)
            lot_sizes.append(formatted_text)
        else:
            lot_sizes.append(lot_size.text)  # In case of non-standard formats
    return lot_sizes

In [2]:
zip_code = '45801'
url = f'https://www.zillow.com/homes/{zip_code}/'
html_content = fetch_html(url)
html_content

<!DOCTYPE html>
<html lang="en"><head><link href="https://fonts.googleapis.com/css?family=Open+Sans:400,600,700&amp;display=swap" rel="stylesheet"/><style type="text/css">@font-face{font-display:swap;font-family:Ivar Headline;font-style:normal;font-weight:600;src:url(//www.zillowstatic.com/static-zsg/LATEST/static-zsg/zsg/z-fonts/ivar/IvarHeadline-SemiBold-extended.woff2) format(&quot;woff2&quot;),url(//www.zillowstatic.com/static-zsg/LATEST/static-zsg/zsg/z-fonts/ivar/IvarHeadline-SemiBold-extended.woff) format(&quot;woff&quot;);unicode-range:u+0000-001f,u+0080-200f,u+2020-faff,u+fb10-ffff}@font-face{font-display:swap;font-family:Ivar Headline;font-style:normal;font-weight:600;src:url(//www.zillowstatic.com/static-zsg/LATEST/static-zsg/zsg/z-fonts/ivar/IvarHeadline-SemiBold-core.woff2) format(&quot;woff2&quot;),url(//www.zillowstatic.com/static-zsg/LATEST/static-zsg/zsg/z-fonts/ivar/IvarHeadline-SemiBold-core.woff) format(&quot;woff&quot;);unicode-range:u+0020-007f,u+201?,u+fb0?}</sty

In [3]:
prices = extract_prices(html_content)
locations = extract_locations(html_content)
sizeSqft = extract_size_sqft(html_content)
print(prices, locations, sizeSqft)

['$249,900', '$162,500', '$99,900', '$149,900', '$119,900', '$85,000', '$144,900', '$299,000', '$62,000'] ['1141 W Bluelick Rd, Lima, OH 45801', '1636 Stewart Rd, Lima, OH 45801', '217 W Robb Ave, Lima, OH 45801', '3775 Slabtown Rd, Lima, OH 45801', '1148 Biscayne Dr, Lima, OH 45801', '831 W Eureka St, Lima, OH 45801', '2126 N Metcalf St, Lima, OH 45801', '5525 Sandusky Rd, Lima, OH 45801', '468 Haller St, Lima, OH 45801'] ['4, bds3, ba2,413, sqft', '3, bds3, ba1,296, sqft', '2, bds2, ba1,180, sqft', '3, bds1, ba1,107, sqft', '3, bds1, ba1,161, sqft', '2, bds1, ba900, sqft', '3, bds1, ba1,218, sqft', '5, bds3, ba2,619, sqft', '3, bds2, ba1,258, sqft']


In [4]:
# Post-process sizeSqft to extract bds, ba, and sqft
bds_sizes = [item.split(', ')[0] for item in sizeSqft]
ba_sizes = [item.split(', ')[1] for item in sizeSqft]
sqft_sizes = [item.split(', ')[2] for item in sizeSqft]


In [9]:
import pandas as pd
# Create a DataFrame
data = {
    'Price': prices,
    'Location': locations,
    'Bds': bds_sizes,
    'Ba': ba_sizes,
    'Sqft': sqft_sizes
}

df = pd.DataFrame(data)


In [10]:
# Extracting numeric values from 'Ba' and 'Sqft' columns
df['Ba'] = df['Ba'].str.extract('(\d+)')
df['Sqft'] = df['Sqft'].str.replace(',', '').str.extract('(\d+)')

# Convert the columns to numeric type
df['Ba'] = pd.to_numeric(df['Ba'])
df['Sqft'] = pd.to_numeric(df['Sqft'])

# Display the modified DataFrame
print(df)

      Price                            Location Bds  Ba  Sqft
0  $249,900  1141 W Bluelick Rd, Lima, OH 45801   4   3  2413
1  $162,500     1636 Stewart Rd, Lima, OH 45801   3   3  1296
2   $99,900      217 W Robb Ave, Lima, OH 45801   2   2  1180
3  $149,900    3775 Slabtown Rd, Lima, OH 45801   3   1  1107
4  $119,900    1148 Biscayne Dr, Lima, OH 45801   3   1  1161
5   $85,000     831 W Eureka St, Lima, OH 45801   2   1   900
6  $144,900   2126 N Metcalf St, Lima, OH 45801   3   1  1218
7  $299,000    5525 Sandusky Rd, Lima, OH 45801   5   3  2619
8   $62,000       468 Haller St, Lima, OH 45801   3   2  1258


In [None]:
# Save the modified DataFrame to a CSV file
#df.to_csv('45801.csv', index=False)