In [None]:
import pandas as pd
from tqdm import tqdm
from plotnine import *

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

# Start a Chrome WebDriver session, passing the chromedriver location as a path
# relative to the notebook's working directory.
# NOTE(review): recent Selenium 4 releases no longer accept the driver path as a
# positional argument — confirm the installed version; if needed, pass it through
# selenium.webdriver.chrome.service.Service(executable_path=...) instead.
driver = webdriver.Chrome("../../Data Science/chromedriver")

In [None]:
# Sitemap page whose URL lists Washington cities; load it in the shared browser
# session so the next cell can read its links.
wa_cities_page = "https://www.apartments.com/sitemap/washington/cities/"
driver.get(wa_cities_page)

In [None]:
# Grab every anchor that carries an href on the currently loaded page
hrefs = driver.find_elements(By.XPATH,"//a[@href]")
# Keep only the targets ending in "-wa/", de-duplicated and in sorted order
wa_city_links = sorted({
    target
    for target in (anchor.get_attribute("href") for anchor in hrefs)
    if target.endswith("-wa/")
})
wa_city_links[:10]

In [None]:
def get_single_property_info(driver,link):
    """Load one property page and scrape its headline details.

    Parameters
    ----------
    driver : object
        Browser session exposing ``get``, ``find_element`` and ``find_elements``
        (used here with ``By`` locators).
    link : str
        URL of the property page to load.

    Returns
    -------
    dict
        Keys ``built_date``, ``unit_count``, ``address``, ``neighborhood`` and
        ``property_name``. ``built_date`` and ``unit_count`` are the raw element
        text, or ``None`` when no matching element could be read.

    Raises
    ------
    Exception
        Whatever the driver raises when the ``propertyName`` element (or, on the
        address fallback path, the ``neighborhood`` element) cannot be found.
        ``KeyboardInterrupt``/``SystemExit`` are never swallowed, so a long
        scrape can still be interrupted.
    """
    driver.get(link)

    # Text of the first div mentioning "Built in"; optional on a page.
    try:
        built_date = driver.find_element(By.XPATH,"//div[contains( text( ), 'Built in')]").text
    except Exception:
        built_date = None

    # First div whose text mentions "units" without mentioning "available".
    unit_count = None
    try:
        for candidate in driver.find_elements(By.XPATH,"//div[contains( text( ), 'unit')]"):
            # Read .text once per element instead of three times.
            text = candidate.text
            lowered = text.lower()
            if "available" not in lowered and 'units' in lowered:
                unit_count = text
                break
    except Exception:
        unit_count = None

    # Left unguarded so the original contract holds: a lookup failure here
    # propagates to the caller.
    property_name = driver.find_element(By.ID,'propertyName').text

    try:
        # Expected shape: exactly two lines, "<address>\n<neighborhood>".
        address_elem = driver.find_elements(By.CLASS_NAME,"propertyAddressContainer")
        address,neighborhood = address_elem[0].text.split("\n")
    except Exception:
        # Container missing or not exactly two lines: take the neighborhood from
        # its own element and use the property name as the address.
        neighborhood = driver.find_element(By.CLASS_NAME,'neighborhood').text
        address = property_name

    return {'built_date':built_date,'unit_count':unit_count,'address':address,
            'neighborhood':neighborhood,'property_name':property_name}

In [None]:
# Fresh containers: re-running this cell restarts the crawl from scratch.
all_property_info = dict()
all_property_links = []
# Loop through cities
for base_city_link in tqdm(wa_city_links):
    # Apartments/condos only
    city_link = base_city_link.replace("apartments.com/","apartments.com/apartments-condos/")
    driver.get(city_link)
    # Check for no results. `except Exception` (not a bare except) keeps the
    # best-effort lookup while still letting a kernel interrupt stop the crawl.
    try:
        no_results_text = driver.find_element(By.CLASS_NAME,"no-results").text
    except Exception:
        # No readable "no-results" element: carry on as if the city has listings.
        no_results_text = ""
    if "NO RESULTS FOUND" in no_results_text:
        continue
    # Get count of pages for each city: the last whitespace-separated token of
    # the "pageRange" text, falling back to a single page when it can't be read.
    try:
        page_range = driver.find_element(By.CLASS_NAME,"pageRange").text
        n_pages = int(page_range.split()[-1])
    except Exception:
        n_pages = 1
    # Loop through pages of listings to get property links
    for page_number in range(1,n_pages+1):
        driver.get(f"{city_link}{page_number}/")
        link_elements = driver.find_elements(By.CLASS_NAME,'property-link')
        property_links = [element.get_attribute("href") for element in link_elements]
        # Skip elements without an href so no None is ever visited later.
        all_property_links.extend(href for href in property_links if href)

In [None]:
# Drop dupes (and any empty hrefs); sorting makes the visiting order repeatable
all_property_links = sorted({link for link in all_property_links if link})
# (link, error) pairs for pages that could not be scraped, kept for inspection/retry
failed_property_links = []
# Loop through properties
for link in tqdm(all_property_links):
    # Already scraped, e.g. this cell is being re-run after an interruption
    if link in all_property_info:
        continue
    try:
        all_property_info[link] = get_single_property_info(driver,link)
    except Exception as err:
        # One broken page should not abort the whole scrape; record it instead.
        failed_property_links.append((link, repr(err)))
print(f"{len(failed_property_links)} of {len(all_property_links)} property pages failed")

In [None]:
def _int_or_none(text):
    """Return ``int(text)`` when ``text`` consists only of decimal digits, else None."""
    # isdecimal() only accepts characters that int() can parse, so this never raises.
    return int(text) if text.isdecimal() else None

def _parse_built_year(built_date):
    """Year taken from the last four characters of the built-date text, or None."""
    if not isinstance(built_date, str):
        return None
    return _int_or_none(built_date[-4:])

def _parse_units(unit_count):
    """Number that precedes the first " units" in the text, or None."""
    if not isinstance(unit_count, str) or " units" not in unit_count:
        return None
    return _int_or_none(unit_count.split(" units")[0])

def _parse_stories(unit_count):
    """Number between the first "/" (if any) and the first " stories", or None."""
    if not isinstance(unit_count, str) or " stories" not in unit_count:
        return None
    after_slash = unit_count[unit_count.find("/")+1:]
    # Unexpected text now yields None instead of raising ValueError mid-apply;
    # surrounding whitespace is still tolerated, as int() tolerated it before.
    return _int_or_none(after_slash.split(" stories")[0].strip())

# One row per scraped property; the dict key (the page URL) becomes the "url" column.
df_wa_property_info = pd.DataFrame(all_property_info).T.reset_index(names="url")
df_wa_property_info['built_year'] = df_wa_property_info['built_date'].apply(_parse_built_year)
df_wa_property_info['units'] = df_wa_property_info['unit_count'].apply(_parse_units)
df_wa_property_info['stories'] = df_wa_property_info['unit_count'].apply(_parse_stories)
print(df_wa_property_info.shape)
df_wa_property_info.to_csv("data/wa_apartments_dot_com_listings.csv",index=False)
df_wa_property_info.head()

In [None]:
# Shape of the subset of listings for which no unit count was parsed
missing_units_mask = df_wa_property_info['units'].isna()
df_wa_property_info[missing_units_mask].shape

In [None]:
# Summary statistics of the parsed build year, cast to integers for display.
# NOTE(review): astype(int) raises if any statistic is NaN (e.g. when no year
# was parsed at all) — confirm the column is populated before relying on this.
df_wa_property_info['built_year'].describe().astype(int)

In [None]:
# Number of listings per parsed story count, ordered by story count (first 15 rows)
df_wa_property_info['stories'].value_counts().sort_index().head(15)

In [None]:
# Tabulate how many buildings report each unit count, ordered by unit count
units_tally = df_wa_property_info['units'].value_counts().sort_index()
wa_unit_count_freqs = pd.DataFrame(units_tally).reset_index()
wa_unit_count_freqs.columns = ['units','buildings']

# Bar chart of that table, restricted to unit counts strictly between 0 and 400
units_in_range = wa_unit_count_freqs.query('units<400&units>0')
(
    ggplot(units_in_range,aes(x='units',y='buildings'))
    + geom_bar(stat='identity')
    # + scale_x_continuous(breaks=list(range(0,30,2)))
    + labs(x='Units per Apartment Building',y='Buildings',
           title = 'Distribution of units per building: WA')
)

In [None]:
# The ten unit counts shared by the largest number of buildings
wa_unit_count_freqs.sort_values("buildings",ascending=False).head(10)

# Single City Testing

In [None]:
# base_url = "http://apartments.com/seattle-wa"
# driver.get(base_url)

In [None]:
#search_bar = driver.find_element(By.ID,"quickSearchLookup")

In [None]:
# page_range = driver.find_element(By.CLASS_NAME,"pageRange").text
# n_pages = int(page_range.split()[-1])

In [None]:
# all_links = []
# for x in range(1,n_pages+1):
#     driver.get(f"{base_url}/{x}/")
#     link_elements = driver.find_elements(By.CLASS_NAME,'property-link')
#     links = [x.get_attribute("href") for x in link_elements]
#     all_links.extend(links)

In [None]:
# built_date = driver.find_element(By.XPATH,"//div[contains( text( ), 'Built in')]").text
# built_date

In [None]:
# unit_counts = driver.find_elements(By.XPATH,"//div[contains( text( ), 'unit')]")
# unit_count = [x.text for x in unit_counts if "available" not in x.text.lower() and 'unit' in x.text.lower()][0]
# unit_count

In [None]:
# address_elem = driver.find_elements(By.CLASS_NAME,"propertyAddressContainer")
# address,neighborhood = address_elem[0].text.split("\n")
# print(address)
# print(neighborhood)

In [None]:
#driver.find_element(By.CLASS_NAME,'propertyName').get_attribute("innerHTML")