In [35]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains

from fake_useragent import UserAgent
import re
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

You're going to need one more function to tell if we should keep clicking next page.

Look at all the li -> a's in the nav bar at the botton. Find the next to last one (meaning the last number before the right arrow). If its title contains "Next page", you're done clicking. Otherwise, keep clicking.

In [55]:
opts = Options()
ua = UserAgent()
user_agent = ua.random

opts.add_argument("user-agent=" + user_agent)
driver = webdriver.Chrome('/home/bryce/Downloads/chromedriver', options=opts)
# This url is just the map of the charleston area. It's zoomed out and positioned so that it captures pretty much all apts in the Charleston/N. Charleston/Mt. P/James Island/Johns Island area,
# but excludes most of Summerville and beyond. Summerville seems like an entirely different market, so I think it makes sense to exclude it.
apt_url = 'https://www.apartments.com/?bb=56oym0-ppH44p590b'
#apt_url = "https://www.apartments.com/charleston-sc/"
#apt_url = "https://www.apartments.com/2030-wildts-battery-blvd-johns-island-sc-unit-33847624/ks54vsb/"
driver.maximize_window()
driver.get(apt_url)


  driver = webdriver.Chrome('/home/bryce/Downloads/chromedriver', options=opts)


In [37]:
from random import randint
from time import sleep

def click_on_element(driver, element):
    sleep(randint(1,4))
    actions = ActionChains(driver)
    actions.move_to_element(element)
    actions.move_by_offset(randint(2, 10), randint(2,10))
    actions.pause(2)
    actions.click()
    actions.perform()

In [47]:
# For each apartment, we will write its info in this format:
PRINT_STR = "{addr};; ${rent};; {beds} bd;; {baths} ba;; {sqft} sq ft\n"
PRICING_GRID_ITEM = "pricingGridItem"
SCREEN_READER_ONLY = "screenReaderOnly"

def get_address_string(soup):
    address_divs = soup.find_all("div", class_="propertyAddressContainer")
    if (len(address_divs) == 0) :
        print("COULD NOT FIND ADDRESS")
        return "UNKNOWN ADDRESS"
    addr_string = " ".join(address_divs[0].text.split())

    # Some properties have a property name like "The Meadows". In these properties, the addr_string contains the street address.
    # For single houses and such, the property name is the street address, and the addr_string will be missing it.
    property_name_split = soup.find("h1", class_="propertyName").text.split()
    if len(property_name_split) == 0:
        print("WARNING: no property name found.")
        return addr_string
    # If the property name starts with a number, we'll assume it's an address.
    first_word = property_name_split[0]
    NUMBERS_DASH_OR_DOT = r'(\d+)(?:[\-\.]\d+)?'
    m = re.match(NUMBERS_DASH_OR_DOT, first_word)
    if m:
        property_name_split[0] = m.groups()[0]  # replace "43-45 Meeting St" with "43 Meeting St" for better latlng lookup
        street_address = " ".join(property_name_split)
        addr_string = street_address + ", " + addr_string

    return addr_string

def is_single_unit_listing(soup):
    # Does this page have a div of class "pricingGridItem"? If so, return false. If not, true
    pricing_grid_items = soup.find_all("div", class_=PRICING_GRID_ITEM)
    if (pricing_grid_items):
        return False
    return True

def save_info_from_single_unit_listing(soup, outfile):
    address = get_address_string(soup)
    rent_details = soup.find_all("p", class_="rentInfoDetail")
    printstr = address + ";; " + ";; ".join(map(lambda rd : rd.string if rd.string else "?", rent_details)) + "\n"
    outfile.write(printstr)

""" Some multi-unit listings have an exact price row for every
unit that's available. Otherse just have a price range for each floor plan.
This function handles the latter case.
We'll just save the price range as the price. When doing data analysis, we can
decide how to handle these cases."""
def save_less_precise_info_from_multi_unit_listing(soup, address, outfile):
    pricing_grid_items = soup.find_all("div", class_=PRICING_GRID_ITEM)
    for item in pricing_grid_items:
        classes = item.parent["class"]
        if 'active' not in classes:
            continue
        rent_range = item.find("span", class_="rentLabel").text.strip()
        ### print("rent range: ", rent_range)
        # The first detailsTextWrapper has the bed, bath, sq ft info
        other_info = item.find(class_="detailsTextWrapper").text
        other_info = other_info.replace("bed", "bd").replace("bath", "ba")
        formatted_info = ";; ".join([w.strip() for w in other_info.split(",")])
        println = "{address};; {rent_range};; {bd_ba_sqft}\n".format(address=address, rent_range=rent_range, bd_ba_sqft=formatted_info)
        outfile.write(println)


def save_info_from_multi_unit_listing(soup, outfile):
    #print("in get info from multi family listing")
    addr = get_address_string(soup)

    pricing_grid_items = soup.find_all("div", class_=PRICING_GRID_ITEM)
    for item in pricing_grid_items:
        classes = item.parent["class"]

        # This excludes not available listings and other listings that aren't shown to the user.
        if 'active' not in classes:
            continue
 
        data_lis = item.find_all("li", attrs={"data-beds" : re.compile(r'.*') })
        
        # Ah ha! This is one of those pages with just a price range
        if len(data_lis) == 0:
            save_less_precise_info_from_multi_unit_listing(soup, addr, outfile)
            return

        BEDS_ATTR = "data-beds"
        BATHS_ATTR = "data-baths"
        for i in range(len(data_lis)):
            ## print("i: ", i)
            li = data_lis[i]
            # The number of beds and baths are attributes of the <li>
            beds = li[BEDS_ATTR]
            baths = li[BATHS_ATTR]

            #Get price
            price_column = li.find("div", class_="pricingColumn")
            price_spans = price_column.find_all("span")
            assert len(price_spans) == 2, "Expected two spans in price column"
            price = price_spans[1].text.strip()

            
            # Get square footage
            sqft_column = li.find("div", class_="sqftColumn")
            sqft_text = sqft_column.text
            sqft_list = map(lambda s: s.replace(",", ""), sqft_text.split())
            sqft_list = [s for s in sqft_list if s.isdigit()]
            sqft = -1
            assert len(sqft_list) == 1, "UNEXPECTED square footage: " + sqft_text
            sqft = sqft_list[0]
            printstr = PRINT_STR.format(addr=addr, rent=price, beds=beds, baths=baths, sqft=sqft)
            outfile.write(printstr)

def get_info_from_listing(driver, apartment_outfile, location_outfile):
    #print("in get_info_from_listing")
    soup = BeautifulSoup(driver.page_source)
    url = driver.current_url
    save_per_location_info(soup, url, location_outfile)
    if is_single_unit_listing(soup):
        try:
            save_info_from_single_unit_listing(soup, apartment_outfile)
        except Exception as e:
            print("Could not extract info from single-unit listing. Got Exception: ", e)
    else:
        try:
            save_info_from_multi_unit_listing(soup, apartment_outfile)
        except Exception as e:
            print("Could not extract info from multi-unit listing. Got Exception: ", e)

   



In [39]:
# This way seems to work better.

soup = BeautifulSoup(driver.page_source)
amenities = []
amenities_section = soup.find('section', class_='amenitiesSection')
bullet_lis = amenities_section.find_all('li', class_='specInfo')
for li in bullet_lis:
    amenity = '()' + li.text.replace('\n', '')
    amenities.append(amenity)

print(amenities)
print(get_address_string(soup))

# # This way misses some amenities. Let's see if another way will catch them all (and work for all the other cases too)
# soup = BeautifulSoup(driver.page_source)
# amenities = []
# combined_amenities_uls = soup.find_all('ul', class_='combinedAmenitiesList')
# for ul in combined_amenities_uls:
#     bullet_lis = ul.find_all('li', class_='specInfo')
#     for li in bullet_lis:
#         amenity = '()' + li.text.replace('\n', '')
#         amenities.append(amenity)

AttributeError: 'NoneType' object has no attribute 'find_all'

In [33]:
title = soup.find('title')
title_str = title.text.replace('| Apartments.com', '')
print(title_str)

 105 Ivy Grn Wy Unit 1-1023.830916, Charleston, SC 29414 - Apartment for Rent in Charleston, SC 


In [53]:
SEMICOLON_REPLACEMENT = ',,,'
NEWLINE_REPLACEMENT = '   '
def replace_semicolons(text):
    return text.replace(';', SEMICOLON_REPLACEMENT)

def add_semicolons_back(text):
    return text.replace(SEMICOLON_REPLACEMENT, ';')

def replace_newlines(text):
    return text.replace('\n', NEWLINE_REPLACEMENT)

def add_newlines_back(text):
    return text.replae(NEWLINE_REPLACEMENT, '\n')

def get_combined_amenities(soup):
    amenities = []
    amenities_section = soup.find('section', class_='amenitiesSection')
    if amenities_section is None:
        return []
    bullet_lis = amenities_section.find_all('li', class_='specInfo')
    for li in bullet_lis:
        amenity = '()' + li.text.replace('\n', '')
        amenities.append(amenity)
        
    return amenities 

def get_page_title(soup):
    title = soup.find('title')
    title_str = title.text.replace('| Apartments.com', '')
    return(title_str)

def get_unique_amenities(soup):
    soup = BeautifulSoup(driver.page_source)
    unique_amenity_lis = soup.find_all('li', class_='uniqueAmenity')
    amenities = []
    for li in unique_amenity_lis:
        amenity = '()' + li.text.replace('\n', '')
        amenities.append(amenity)
    return amenities


def get_bullet_points(soup):
    combined_amenities = get_combined_amenities(soup)
    unique_amenities = get_unique_amenities(soup)
    amenities = combined_amenities if combined_amenities else unique_amenities
    if not combined_amenities and unique_amenities:
        print("Could not find combined_amenities, but did find unique_amenities")
    return '. '.join(amenities) if amenities else ''

def save_per_location_info(soup, url, outfile):
    address = get_address_string(soup)
    address_without_semis = replace_semicolons(address)
    if (address != address_without_semis):
        print("WARNING, address " + address + " has semicolons, for url: ", url)
    description_section = soup.find("section", class_="descriptionSection")
    blurb = ''
    if description_section is None:
        print ('Missing descriptionSection')
    else:
        ps = description_section.find_all("p")
        if ps is None:
            print('No p tag in descriptionSection')
        else:
            p1 = ps[0]
            if len(p1.attrs) > 0:
                print("WARNING, first <p> of description section has attrs for url: ", url)
            blurb = p1.text
            if len(blurb) == 0:
                print("WARNING, this page's blurb was length 0, for url ", url)
            blurb = replace_newlines(replace_semicolons(blurb))
    bullets = get_bullet_points(soup)
    bullets = replace_newlines(replace_semicolons(bullets))
    title = replace_semicolons(get_page_title(soup))
    saveline = "{address}; {url}; {title}; {bullets}; {blurb}\n".format(address=address_without_semis, url=url, title=title, bullets=bullets, blurb=blurb)
    outfile.write(saveline)

    

In [5]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(driver.page_source)
print(get_address_string(soup))


NameError: name 'driver' is not defined

In [51]:
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep
from random import randint

def is_page_last(driver):
    page_range = driver.find_element(By.CLASS_NAME, "pageRange")
    page_range_text = page_range.text
    numbers = [int(w) for w in page_range_text.split() if w.isdigit()]
    assert len(numbers) == 2, "We expect the page range to have two numbers in it, but apparently it doesn't: " + page_range_text
    return numbers[0] == numbers[1]

def click_to_next_page(driver):
    next_page_link = driver.find_element(By.CSS_SELECTOR, "a.next")
    sleep(randint(3,5))
    actions = ActionChains(driver)
    actions.move_to_element_with_offset(next_page_link, randint(1,10), randint(1,10))
    actions.pause(randint(1,3))
    actions.click()
    actions.perform()

#click_to_next_page(driver)
    

In [50]:
import random

def save_all_results_from_page(driver, apts_outfile, addr_outfile):
    apts_outfile.write("\n\n")
    sleep(random.randint(5,9))
    count = 0
    maxcount = 1000  # set to 1000 unless debugging
    while True:
        links = driver.find_elements(By.CSS_SELECTOR, "div.item.active.us")
        if count >= len(links) or count > maxcount:
            print("count: ", count)
            print("number of links: ", len(links))
            break
        link = links[count]
        click_on_element(driver, link)
        sleep(random.random() + randint(1,2))
        get_info_from_listing(driver, apts_outfile, addr_outfile)
        number_appendix_map = {1 : 'st', 2 : 'nd', 3: 'rd', 11 : 'th', 12 : 'th'}
        number_to_display = count + 1
        appendix = number_appendix_map.get(number_to_display % 10, 'th')
        print("Saved " + str(number_to_display) + "'" + appendix + " result.")
        driver.back()
        sleep(random.random() + randint(1,2))
        count += 1
    

In [34]:
apt_outfile = open("/tmp/apt_results_p7.csv", "w")
addr_outfile = open("/tmp/addr_results_p7.csv", "w")
save_all_results_from_page(driver, apt_outfile, addr_outfile)
apt_outfile.close()
addr_outfile.close()

Saved 1'st result.
Saved 2'nd result.
Saved 3'rd result.
Saved 4'th result.
Saved 5'th result.
count:  5
number of links:  25


In [49]:
def save_all_results(driver, apt_outfile, addr_outfile):
    page = 1
    while True:
        print("About to save results from page ", page)
        save_all_results_from_page(driver, apt_outfile, addr_outfile)
        print("Finished saving page ", page)
        if is_page_last(driver):
            print("It's the last page!")
            break
        print("Clicking to next page...")
        click_to_next_page(driver)
        page += 1
            


In [56]:
apt_outfile = open("/home/bryce/Projects/Data_Science/Apt_Prices/apt_scraping_results.csv", "w")
addr_outfile = open("/home/bryce/Projects/Data_Science/Apt_Prices/addr_scraping_results.csv", "w")

save_all_results(driver, apt_outfile, addr_outfile)
apt_outfile.close()
addr_outfile.close()

About to save results from page  1
Saved 1'st result.
Saved 2'nd result.
Saved 3'rd result.
Saved 4'th result.
Saved 5'th result.
Saved 6'th result.
Saved 7'th result.
Saved 8'th result.
Saved 9'th result.
Saved 10'th result.
Saved 11'st result.
Saved 12'nd result.
Saved 13'rd result.
Saved 14'th result.
Saved 15'th result.
Saved 16'th result.
Saved 17'th result.
Saved 18'th result.
Saved 19'th result.
Saved 20'th result.
Saved 21'st result.
Saved 22'nd result.
Saved 23'rd result.
Saved 24'th result.
Saved 25'th result.
count:  25
number of links:  25
Finished saving page  1
Clicking to next page...
About to save results from page  2
Saved 1'st result.
Saved 2'nd result.
Saved 3'rd result.
Saved 4'th result.
Saved 5'th result.
Saved 6'th result.
Saved 7'th result.
Saved 8'th result.
Saved 9'th result.
Saved 10'th result.
Saved 11'st result.
Saved 12'nd result.
Saved 13'rd result.
Saved 14'th result.
Saved 15'th result.
Saved 16'th result.
Saved 17'th result.
Saved 18'th result.
Saved 1

In [None]:
outfile = open("/home/bryce/Projects/Data_Science/Apt_Prices/all-results.csv", "w")

save_all_results(driver, outfile)
outfile.close()

In [None]:
out_file = open('/tmp/aptscom.csv', 'w')
print("hi")
get_info_from_listing(driver, out_file)
print("bye")
out_file.close()  