In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent
import re
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

In [14]:
opts = Options()
ua = UserAgent()
user_agent = ua.random

opts.add_argument("user-agent=" + user_agent)
driver = webdriver.Chrome('/home/bryce/Downloads/chromedriver', options=opts)
apt_url = "https://www.apartments.com/5-murphy-ct-charleston-sc/mhkz90k/"
#apt_url = "https://www.apartments.com/2030-wildts-battery-blvd-johns-island-sc-unit-33847624/ks54vsb/"
driver.get(apt_url)
driver.maximize_window()

  driver = webdriver.Chrome('/home/bryce/Downloads/chromedriver', options=opts)


In [15]:
soup = BeautifulSoup(driver.page_source)

description_section = soup.find("section", class_="descriptionSection")


In [27]:
# For each apartment, we will write its info in this format:
PRINT_STR = "{addr};; ${rent};; {beds} bd;; {baths} ba;; {sqft} sq ft\n"
PRICING_GRID_ITEM = "pricingGridItem"
SCREEN_READER_ONLY = "screenReaderOnly"

def get_address_string(soup):
    address_divs = soup.find_all("div", class_="propertyAddressContainer")
    if (len(address_divs) == 0) :
        print("COULD NOT FIND ADDRESS")
        return "UNKNOWN ADDRESS"
    addr_string = " ".join(address_divs[0].text.split())

    # Some properties have a property name like "The Meadows". In these properties, the addr_string contains the street address.
    # For single houses and such, the property name is the street address, and the addr_string will be missing it.
    property_name_split = soup.find("h1", class_="propertyName").text.split()
    if len(property_name_split) == 0:
        print("WARNING: no property name found.")
        return addr_string
    # If the property name starts with a number, we'll assume it's an address.
    first_word = property_name_split[0]
    NUMBERS_DASH_OR_DOT = r'(\d+)(?:[\-\.]\d+)?'
    m = re.match(NUMBERS_DASH_OR_DOT, first_word)
    if m:
        property_name_split[0] = m.groups()[0]  # replace "43-45 Meeting St" with "43 Meeting St" for better latlng lookup
        street_address = " ".join(property_name_split)
        addr_string = street_address + ", " + addr_string

    return addr_string

def is_single_unit_listing(soup):
    # Does this page have a div of class "pricingGridItem"? If so, return false. If not, true
    pricing_grid_items = soup.find_all("div", class_=PRICING_GRID_ITEM)
    if (pricing_grid_items):
        return False
    return True

def save_info_from_single_unit_listing(soup, outfile):
    address = get_address_string(soup)
    rent_details = soup.find_all("p", class_="rentInfoDetail")
    printstr = address + ";; " + ";; ".join(map(lambda rd : rd.string if rd.string else " ", rent_details)) + "\n"
    outfile.write(printstr)

""" Some multi-unit listings have an exact price row for every
unit that's available. Otherse just have a price range for each floor plan.
This function handles the latter case.
We'll just save the price range as the price. When doing data analysis, we can
decide how to handle these cases."""
def save_less_precise_info_from_multi_unit_listing(soup, address, outfile):
    pricing_grid_items = soup.find_all("div", class_=PRICING_GRID_ITEM)
    for item in pricing_grid_items:
        classes = item.parent["class"]
        if 'active' not in classes:
            continue
        rent_range = item.find("span", class_="rentLabel").text.strip()
        ### print("rent range: ", rent_range)
        # The first detailsTextWrapper has the bed, bath, sq ft info
        other_info = item.find(class_="detailsTextWrapper").text
        other_info = other_info.replace("bed", "bd").replace("bath", "ba")
        formatted_info = ";; ".join([w.strip() for w in other_info.split(",")])
        println = "{address};; {rent_range};; {bd_ba_sqft}\n".format(address=address, rent_range=rent_range, bd_ba_sqft=formatted_info)
        outfile.write(println)


def save_info_from_multi_unit_listing(soup, outfile):
    #print("in get info from multi family listing")
    addr = get_address_string(soup)

    pricing_grid_items = soup.find_all("div", class_=PRICING_GRID_ITEM)
    for item in pricing_grid_items:
        classes = item.parent["class"]

        # This excludes not available listings and other listings that aren't shown to the user.
        if 'active' not in classes:
            continue
 
        data_lis = item.find_all("li", attrs={"data-beds" : re.compile(r'.*') })
        
        # Ah ha! This is one of those pages with just a price range
        if len(data_lis) == 0:
            save_less_precise_info_from_multi_unit_listing(soup, addr, outfile)
            return

        BEDS_ATTR = "data-beds"
        BATHS_ATTR = "data-baths"
        for i in range(len(data_lis)):
            ## print("i: ", i)
            li = data_lis[i]
            # The number of beds and baths are attributes of the <li>
            beds = li[BEDS_ATTR]
            baths = li[BATHS_ATTR]

            #Get price
            price_column = li.find("div", class_="pricingColumn")
            price_spans = price_column.find_all("span")
            assert len(price_spans) == 2, "Expected two spans in price column"
            price = price_spans[1].text.strip()

            
            # Get square footage
            sqft_column = li.find("div", class_="sqftColumn")
            sqft_text = sqft_column.text
            sqft_list = map(lambda s: s.replace(",", ""), sqft_text.split())
            sqft_list = [s for s in sqft_list if s.isdigit()]
            sqft = -1
            assert len(sqft_list) == 1, "UNEXPECTED square footage: " + sqft_text
            sqft = sqft_list[0]
            printstr = PRINT_STR.format(addr=addr, rent=price, beds=beds, baths=baths, sqft=sqft)
            outfile.write(printstr)

def get_info_from_listing(driver, apartment_outfile, location_outfile):
    #print("in get_info_from_listing")
    soup = BeautifulSoup(driver.page_source)
    url = driver.current_url
    save_per_location_info(soup, url, location_outfile)
    if is_single_unit_listing(soup):
        try:
            save_info_from_single_unit_listing(soup, apartment_outfile)
        except Exception as e:
            print("Could not extract info from single-unit listing. Got Exception: ", e)
    else:
        try:
            save_info_from_multi_unit_listing(soup, apartment_outfile)
        except Exception as e:
            print("Could not extract info from multi-unit listing. Got Exception: ", e)

   



In [38]:
m = re.match(r'(\d+)(?:[\-\.]\d+)?', '45')
m.groups()[0]

'45'

In [29]:
apts_outfile = open("/tmp/test.csv" , "w")
addr = get_address_string(soup)

save_info_from_single_unit_listing(soup=soup, outfile=apts_outfile)

apts_outfile.close()

In [19]:
import importlib
apts = importlib.import_module("ipynb.fs.full.scraping_apartments-dot-com")


WebDriverException: Message: unknown error: cannot determine loading status
from target frame detached
  (Session info: chrome=109.0.5414.74)
Stacktrace:
#0 0x55976d913303 <unknown>
#1 0x55976d6e7bbd <unknown>
#2 0x55976d6d2233 <unknown>
#3 0x55976d6d0c77 <unknown>
#4 0x55976d6d1408 <unknown>
#5 0x55976d6dea1a <unknown>
#6 0x55976d6df2d2 <unknown>
#7 0x55976d6effd0 <unknown>
#8 0x55976d6f434b <unknown>
#9 0x55976d6d19c5 <unknown>
#10 0x55976d6efbd2 <unknown>
#11 0x55976d75caa0 <unknown>
#12 0x55976d744753 <unknown>
#13 0x55976d717a14 <unknown>
#14 0x55976d718b7e <unknown>
#15 0x55976d96232e <unknown>
#16 0x55976d965c0e <unknown>
#17 0x55976d948610 <unknown>
#18 0x55976d966c23 <unknown>
#19 0x55976d93a545 <unknown>
#20 0x55976d9876a8 <unknown>
#21 0x55976d987836 <unknown>
#22 0x55976d9a2d13 <unknown>
#23 0x7fd498254609 start_thread


In [13]:
ps = description_section.find_all("p")
p2 = ps[1]
p2.attrs 
len(p2.attrs)

1

In [None]:
SEMICOLON_REPLACEMENT = ',,,'
def replace_semicolons(text):
    text = text.replace(';', SEMICOLON_REPLACEMENT)
    return text

def add_semicolons_back(text):
    text = text.replace(SEMICOLON_REPLACEMENT, ';')
    return text

In [14]:
a = 'a string;'
b = a.encode(encoding='ascii')
c = a.encode()

print(b)
print(c)

b'a string;'
b'a string;'
