# Web Scraping Apartments Website

Scrape apartment listings for a city of your choice to see what is available in terms of amenities, price, and number of rooms. The notebook is easy to customize to your own needs, so feel free to play around with it.

In [None]:
# Make our necessary imports
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import StaleElementReferenceException

## Instantiate all our variables
**Note:** `city_state` must be formatted in a specific way, as noted in the code, while `1-bedrooms` in the link can be modified to fit your own needs. I only needed the 1-bedroom listings, though.

In [None]:
# Accumulators filled in by the scraping loop:
#   unique_amenities - every distinct amenity seen (later one column each)
#   data             - one dict per scraped listing
unique_amenities, data = set(), []

# Target city slug; `city_state` must be formatted like 'atlanta-ga'
city_state = ''

# Launch Chrome through the local chromedriver binary, using default options
options = webdriver.ChromeOptions()
service = Service(executable_path = 'C:/Users/Brady/anaconda3/chromedriver.exe')
driver = webdriver.Chrome(service = service, options = options)

# Open the 1-bedroom search results for the chosen city
search_url = 'https://www.apartments.com/{}/1-bedrooms'.format(city_state)
driver.get(search_url)


In [None]:
# Walk the paginated search results: scrape every listing on the page that
# is currently loaded, then click "next"; stop when that click is not possible.
while True:

    # Parse the current page and collect its listing elements (postings)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    postings = soup.find_all('li', class_ = 'mortar-wrapper')

    # Loop through each post on the current page
    for post in postings:

        # Link and complex name are not guarded: a listing missing either
        # one raises AttributeError and stops the scrape.
        link = post.find('a', class_ = 'property-link').get('href')
        apt_complex = post.find('span', class_ = 'js-placardTitle title').text

        # Address and price are optional; `find` returns None when the
        # element is absent, in which case the field is recorded as 'N/A'.
        address_tag = post.find('div', class_ = 'property-address js-url')
        address = address_tag.text if address_tag is not None else 'N/A'

        price_tag = post.find('p', class_ = 'property-pricing')
        price = price_tag.text if price_tag is not None else 'N/A'

        # Amenities are optional too. The list is rebuilt for every listing
        # so that a listing without amenities gets an empty list instead of
        # inheriting the previous listing's amenities.
        amenities_tag = post.find('p', class_ = 'property-amenities')
        if amenities_tag is None:
            amenities_list = []
        else:
            # One amenity per line; drop blank lines and surrounding whitespace
            amenities_list = [
                line.strip()
                for line in amenities_tag.text.split('\n')
                if line.strip()
            ]
            unique_amenities.update(amenities_list)

        data.append({'Apartment Complex' : apt_complex, 'Website' : link, 'Address' : address,
                    'Price' : price, 'Amenities' : amenities_list})

    # Any failure to find or click the "next" control is treated as the end
    # of the results. `Exception` (not a bare except) lets Ctrl-C propagate.
    try:
        driver.find_element(By.CLASS_NAME, 'next ').click()
    except Exception:
        break

    # Give the next page time to load before parsing it
    sleep(5)

In [None]:
# Build the listings table: one row per scraped listing
df = pd.DataFrame(data)

# Give every distinct amenity its own 0/1 column that tells whether the
# listing's amenity list contains it
for amenity_name in unique_amenities:
    df[amenity_name] = df['Amenities'].apply(
        lambda offered, wanted=amenity_name: int(wanted in offered)
    )

# The raw list column is no longer needed once the indicator columns exist
df.drop(columns=['Amenities'], inplace=True)

In [None]:
# Create separate columns for lower and upper prices.
# `str.partition` always returns exactly three columns (before, separator,
# after) and splits on the first '-' only, so this works whether a price has
# no dash, one dash, or several. A plain `str.split(..., expand=True)` yields
# a variable number of columns and fails on the two-column assignment.
price_parts = df['Price'].str.partition('-')

# Strip the whitespace that surrounds the dash in the original text
df['Lower Price'] = price_parts[0].str.strip()

# Rows without a dash have no upper bound: leave them missing instead of ''
df['Upper Price'] = price_parts[2].str.strip().where(price_parts[1] == '-')

df = df.drop(columns=['Price'])

In [None]:
# Rebuild the dataframe from the raw scraped records and display it.
# NOTE(review): this reassignment replaces the `df` produced by the earlier
# cells, so the per-amenity indicator columns and the 'Lower Price' /
# 'Upper Price' columns are no longer on `df` when it is written to CSV in
# the following cell. Confirm that exporting the raw records is intended.
df = pd.DataFrame(data)
# Bare expression: by default a notebook only displays the last expression
# of a cell, so this line produces no output here.
unique_amenities
df

In [None]:
# Save the listings table (without the index column) where Power BI can read it
listings_csv_path = 'C:/Users/Brady/Web Scraping/plano-apartments.csv'
df.to_csv(listings_csv_path, index = False)

In [None]:
# Re-key the scraped records: same fields, with the complex name under "Name"
complex_data = [
    {
        "Name": listing["Apartment Complex"],
        "Website": listing["Website"],
        "Address": listing["Address"],
        "Price": listing["Price"],
        "Amenities": listing["Amenities"],
    }
    for listing in data
]

# Assign each distinct amenity a sequential ID (starting at 1) in the order
# it is first encountered
amenity_id_map = {}
for record in complex_data:
    for amenity_name in record["Amenities"]:
        # Only unseen amenities get a new ID; the next ID is always size + 1
        amenity_id_map.setdefault(amenity_name, len(amenity_id_map) + 1)
next_amenity_id = len(amenity_id_map) + 1

# One association row per (complex, amenity) pair, carrying the mapped ID
complex_amenities_data = [
    {
        "ComplexName": record["Name"],
        "AmenityID": amenity_id_map[amenity_name],
        "AmenityName": amenity_name,
    }
    for record in complex_data
    for amenity_name in record["Amenities"]
]


In [None]:
# Amenity lookup table: one row per amenity with its numeric ID. The dict
# keys (amenity names) start out as the index, so they are moved into a
# regular column named 'AmenityName'.
complex_amenities = (
    pd.DataFrame.from_dict(amenity_id_map, orient='index', columns=['AmenityID'])
    .reset_index()
    .rename(columns={'index': 'AmenityName'})
)

# Complex-to-amenity association table and the complexes table
complex_amenities_map = pd.DataFrame(complex_amenities_data)
apt_complex = pd.DataFrame(complex_data)

In [None]:
# Write each table to its own local csv file, without the index column.
# NOTE: the 'copmlex' spelling in the last filename is kept exactly as-is;
# anything that already reads that file may depend on the name.
csv_exports = [
    (complex_amenities, 'C:/Users/Brady/Web Scraping/complex_amenities.csv'),
    (apt_complex, 'C:/Users/Brady/Web Scraping/apts.csv'),
    (complex_amenities_map, 'C:/Users/Brady/Web Scraping/copmlex_amenities_map.csv'),
]
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index = False)