In [1]:
# Install the third-party packages this notebook uses into the kernel's environment.
# NOTE(review): versions are unpinned, so a fresh run may resolve different releases — consider pinning.
%pip install beautifulsoup4
%pip install pandas
%pip install webdriver-manager
%pip install selenium
# NOTE(review): this installs `multiprocess`, but the import cell uses the stdlib `multiprocessing`,
# and `requests` is imported there without being installed here — confirm both are intentional.
%pip install multiprocess

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as BS
from multiprocessing import Process
import pandas as pd
from urllib.parse import urlparse, urlsplit
import time
import requests

In [3]:
# Ski Homes
#
# One entry per ski destination to scrape:
#   'loc' - short identifier for the destination
#   'url' - Airbnb search-results URL for that destination; pagination is added
#           later by appending an `&items_offset=` query parameter.

ski_homes = [
    {
        # NOTE(review): the town in the URL is spelled "Breckenridge"; the key's spelling is kept as-is.
        'loc': 'breckinridge',
        'url': 'https://www.airbnb.com/s/Breckenridge--CO/homes?adults=1&place_id=ChIJwecmbD32aocReqKAZn-PjWI&refinement_paths%5B%5D=%2Fhomes'
    },
    {
        'loc': 'parkcity',
        'url': 'https://www.airbnb.com/s/Park-City--UT/homes?adults=1&place_id=ChIJ_QNjLGMPUocRlFc3Jd_Ecdg&refinement_paths%5B%5D=%2Fhomes'
    },
    {
        'loc': 'jacksonhole',
        'url': 'https://www.airbnb.com/s/Jackson-Hole--WY/homes?adults=1&place_id=ChIJS3_P_FgaU1MRXIM6scsBHD0&refinement_paths%5B%5D=%2Fhomes'
    },
    {
        'loc': 'vail',
        'url': 'https://www.airbnb.com/s/Vail--CO/homes?adults=1&place_id=ChIJB89dUQVwaocRxKOafh_AzMk&refinement_paths%5B%5D=%2Fhomes'
    },
    {
        'loc': 'steamboat',
        'url': 'https://www.airbnb.com/s/Steamboat-Springs--CO/homes?adults=1&place_id=ChIJYUZWCYF7QocRfc9uSNGjqBs&refinement_paths%5B%5D=%2Fhomes'
    },
    {
        'loc': 'bigsky',
        'url': 'https://www.airbnb.com/s/Big-Sky--MT/homes?adults=1&place_id=ChIJNSw3_WUOUFMRyiuLqjtx-JU&refinement_paths%5B%5D=%2Fhomes'
    },
    {
        'loc': 'telluride',
        'url': 'https://www.airbnb.com/s/Telluride--CO/homes?adults=1&place_id=ChIJc_TmcHvYPocR4eO6cSF37jg&refinement_paths%5B%5D=%2Fhomes'
    },
    {
        'loc': 'aspen',
        'url': 'https://www.airbnb.com/s/Aspen--CO/homes?adults=1&place_id=ChIJfTxB93w5QIcRcvYseNxCK8E&refinement_paths%5B%5D=%2Fhomes'
    },
    {
        'loc': 'tahoe',
        'url': 'https://www.airbnb.com/s/Lake-Tahoe/homes?adults=1&place_id=ChIJUREfuaF4mYARILWv7q8fP4w&refinement_paths%5B%5D=%2Fhomes'
    },
    {
        # This entry searches Taos, NM; it was previously labelled 'tahoe', duplicating the key above.
        'loc': 'taos',
        'url': 'https://www.airbnb.com/s/Taos--NM/homes?adults=1&place_id=ChIJsfwRf9pkF4cRgrepYYOR6pA&refinement_paths%5B%5D=%2Fhomes'
    }
]



In [4]:
"""Scrape HTML from page"""

def scrape_page(page_url, timeout=30):
    """Download ``page_url`` and parse the response body with BeautifulSoup.

    Args:
        page_url: URL to fetch with an HTTP GET.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The ``BeautifulSoup`` object built from the response content using the
        ``html.parser`` backend.

    Raises:
        Whatever ``requests.get`` raises (including its timeout error); the HTTP
        status code is not checked.
    """
    # requests applies no timeout by default, so a stalled connection would
    # otherwise block the notebook indefinitely.
    response = requests.get(page_url, timeout=timeout)
    return BS(response.content, features='html.parser')


In [5]:
""" Children functions that scrape """

def getListingName(soup):
    """Return the text of the first ``<span class="_1n81at5">`` in ``soup``.

    Returns:
        The element's text, or ``None`` when the lookup or text extraction fails
        (for example because the element is not present).
    """
    try:
        return soup.find("span", {"class": "_1n81at5"}).get_text()
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt/SystemExit
        # can still stop a long-running scrape.
        return None

def getListingRating(soup):
    """Return the leading number of the first ``<span class="_17p6nbba">`` as a float.

    The element text is split on single spaces and the first piece is converted
    with ``float``.

    Returns:
        The parsed float, or ``None`` when the element is missing or the first
        piece is not a number.
    """
    try:
        text = soup.find("span", {"class": "_17p6nbba"}).get_text()
        return float(text.split(' ')[0])
    except Exception:
        # Narrower than a bare except: interrupts still propagate.
        return None

def getNumberReviews(soup):
    """Return the review count at the start of the first ``<span class="_s65ijh7">``.

    Everything before the first space is taken as the count. Thousands
    separators are removed before conversion, so a text such as
    ``"1,204 reviews"`` yields ``1204.0`` instead of being discarded.

    Returns:
        The count as a float (the type callers already receive), or ``None``
        when the element is missing or its leading piece is not a number.
    """
    try:
        txt = soup.find("span", {"class": "_s65ijh7"}).get_text()
        leading = txt.split(' ', 1)[0]
        return float(leading.replace(',', ''))
    except Exception:
        # Narrower than a bare except: interrupts still propagate.
        return None


def isSuperhost(soup):
    """Report whether the first ``<span class="_1mhorg9">`` reads exactly ``'Superhost'``.

    Returns:
        ``True`` on an exact text match; ``False`` otherwise, including when the
        element is missing.
    """
    try:
        return soup.find("span", {"class": "_1mhorg9"}).get_text() == 'Superhost'
    except Exception:
        # Narrower than a bare except: interrupts still propagate.
        return False

def isNewListing(soup):
    """Report whether the first ``<span class="_1mhorg9">`` reads exactly ``'New'``.

    Returns:
        ``True`` on an exact text match; ``False`` otherwise, including when the
        element is missing.
    """
    try:
        return soup.find("span", {"class": "_1mhorg9"}).get_text() == 'New'
    except Exception:
        # Narrower than a bare except: interrupts still propagate.
        return False

def getLocationData(soup):
    """Split the first ``<span class="_9xiloll">`` text into city, state and country.

    The text is split on ``', '`` and must produce exactly three parts.

    Returns:
        ``[city, state, country]``, or ``[None, None, None]`` when the element
        is missing or the text does not split into exactly three parts.
    """
    try:
        city, state, country = soup.find("span", {"class": "_9xiloll"}).get_text().split(', ')
        return [city, state, country]
    except Exception:
        # Narrower than a bare except: interrupts still propagate.
        return [None, None, None]

def makeListingPriceFloat(string):
    """Convert a price string such as ``'$1,234'`` to a float.

    Every ``'$'`` and ``','`` character is dropped and the remainder is handed
    to ``float``, which raises ``ValueError`` if what is left is not a number.
    """
    kept = ''.join(ch for ch in string if ch not in ('$', ','))
    return float(kept)

def getPriceOfElement(soup, class_name):
    """Return the price shown in the first ``<span>`` with CSS class ``class_name``.

    Args:
        soup: Parsed page (or element) to search.
        class_name: Value of the ``class`` attribute to look for.

    Returns:
        The element text converted by ``makeListingPriceFloat``, or ``None``
        when the element is missing or its text cannot be converted.
    """
    try:
        price_txt = soup.find("span", {"class": class_name}).get_text()
        return makeListingPriceFloat(price_txt)
    except Exception:
        # Narrower than a bare except: interrupts still propagate.
        return None

def extractFromElemArray(array, fn):
    """Call ``fn`` on each element of ``array`` in order.

    The return values of ``fn`` are discarded; the function itself returns ``None``.
    """
    for element in array:
        fn(element)
    return None
        
def labelChecker(txt):
    """Map a price-row label element to the output field it describes.

    ``txt`` must provide ``get_text()``. Keywords are tested in order and the
    first match wins; the checks are case-sensitive.

    Returns:
        ``'multi_day_deal'``, ``'cleaning_fee'``, ``'airbnb_service_fee'``, or
        ``None`` when no keyword occurs in the label text.
    """
    label_text = txt.get_text()
    keyword_to_field = (
        ('night', 'multi_day_deal'),
        ('Cleaning', 'cleaning_fee'),
        ('service', 'airbnb_service_fee'),
    )
    for keyword, field in keyword_to_field:
        if keyword in label_text:
            return field
    return None

def getPricesFromMiniGuide(soup):
    """Extract the itemised fees from the booking sidebar.

    Within the ``<div data-section-id="BOOK_IT_SIDEBAR">`` element, price spans
    (class ``_1k4xcdh``) and label divs (class ``_m6lwl6``) are paired by
    position. Each label is classified with ``labelChecker`` and the matching
    price is stored under that field name.

    Returns:
        A dict with keys ``multi_day_deal``, ``cleaning_fee`` and
        ``airbnb_service_fee`` (in that order). Values are floats, or ``None``
        for fields that were not found. If anything fails part-way through, the
        values gathered so far are returned.
    """
    output = {
        "multi_day_deal" : None,
        "cleaning_fee" : None,
        "airbnb_service_fee" : None
    }

    try:
        # Look the sidebar up once instead of once per query.
        sidebar = soup.find("div", {"data-section-id": "BOOK_IT_SIDEBAR"})
        all_prices = sidebar.findAll("span", {"class":"_1k4xcdh"})
        all_labels = sidebar.findAll("div", {"class":"_m6lwl6"})
        formatted_labels = list(map(labelChecker, all_labels))

        # zip stops at the shorter sequence, so a price without a matching label
        # ends the pairing instead of raising IndexError.
        for price_elem, label in zip(all_prices, formatted_labels):
            if price_elem is None:
                continue
            float_price = makeListingPriceFloat(price_elem.get_text())

            if label is not None:
                output[label] = float_price
    except Exception:
        # Best effort: keep whatever was collected. `Exception` (not a bare
        # except) lets KeyboardInterrupt/SystemExit through.
        pass
    return output

def getTopRatingCategories(soup):
    """Return the text of every ``<div class="_y1ba89">`` in ``soup``, in document order.

    Returns an empty list when no such element exists.
    """
    return [node.get_text() for node in soup.findAll("div", {"class":"_y1ba89"})]
    
def getOrigPricePerNight(soup):
    """Return the price in the ``<span class="_1ks8cgb">`` element via ``getPriceOfElement``.

    Returns:
        The price as a float, or ``None`` when it cannot be obtained.
    """
    try:
        return getPriceOfElement(soup, "_1ks8cgb")
    except Exception:
        # Narrower than a bare except: interrupts still propagate.
        return None

def getReducedPricePerNight(soup):
    """Return the price in the ``<span class="_1y74zjx">`` element via ``getPriceOfElement``.

    Returns:
        The price as a float, or ``None`` when it cannot be obtained.
    """
    try:
        return getPriceOfElement(soup, "_1y74zjx")
    except Exception:
        # Narrower than a bare except: interrupts still propagate.
        return None

def getNightlyPrice(soup):
    """Return the price in the ``<span class="_tyxjp1">`` element via ``getPriceOfElement``.

    Returns:
        The price as a float, or ``None`` when it cannot be obtained.
    """
    try:
        return getPriceOfElement(soup, "_tyxjp1")
    except Exception:
        # Narrower than a bare except: interrupts still propagate.
        return None

        
def getNumberFromString(sentence):
    """Return the first number in ``sentence``, else its first word longer than 3 characters.

    ``sentence`` is split on single spaces and the pieces are examined left to
    right:

    * a piece that is a plain or decimal number (``"4"``, ``"2.5"``), optionally
      followed by ``+`` (``"16+"``), is returned as a float;
    * otherwise a piece longer than three characters is returned unchanged as a
      string (this is how a textual description is passed through);
    * shorter non-numeric pieces are skipped.

    Decimals and the ``+`` suffix were previously not recognised, so
    ``"2.5 baths"`` produced ``"baths"`` and ``"16+ guests"`` produced
    ``"guests"``.

    Returns:
        A float, a string, or ``None`` when no piece qualifies.
    """
    for token in sentence.split(' '):
        candidate = token.rstrip('+')
        # Dropping at most one '.' lets "2.5" pass the digit check while "1.2.3" does not.
        if candidate.replace('.', '', 1).isdecimal():
            return float(candidate)
        if len(token) > 3:
            return token

    return None
            


def getSleepingArrangement(soup):
    """Read the size/guests/bedrooms/bathrooms summary of a listing.

    The ``<li class="l7n4lsf">`` items inside ``<div class="_tqmy57">`` are
    visited in order and each item's text is reduced with
    ``getNumberFromString``:

    * the item at position 1 is always stored as ``listing_size``;
    * any other item is stored under ``num_guests``, ``num_bedrooms`` and/or
      ``num_bathrooms`` when its text contains ``'guests'``, ``'bed'`` or
      ``'bath'`` respectively.

    Returns:
        A dict with the four keys above (in the order ``listing_size``,
        ``num_guests``, ``num_bedrooms``, ``num_bathrooms``). Fields that were
        not found stay ``None``; if anything fails part-way through, the values
        gathered so far are returned.
    """
    output = {
        'listing_size': None,
        'num_guests': None,
        'num_bedrooms': None,
        'num_bathrooms': None
    }
    try:
        all_items = soup.find("div", {"class": "_tqmy57"}).findAll("li", {"class": "l7n4lsf"})
        for index, elem in enumerate(all_items):
            item = elem.get_text()

            # NOTE(review): position 1 is treated as the listing size regardless
            # of its text — confirm this matches the page layout being scraped.
            if index == 1:
                output['listing_size'] = getNumberFromString(item)
                continue

            if 'guests' in item:
                output['num_guests'] = getNumberFromString(item)

            # NOTE(review): 'bed' matches both "bedrooms" and "beds"; when both
            # items are present the later one overwrites the earlier — confirm
            # which count is intended.
            if 'bed' in item:
                output['num_bedrooms'] = getNumberFromString(item)

            if 'bath' in item:
                output['num_bathrooms'] = getNumberFromString(item)
    except Exception:
        # Best effort: keep whatever was collected. `Exception` (not a bare
        # except) lets KeyboardInterrupt/SystemExit through.
        pass
    return output

def getAmmenities(soup):
    """Return the text of every ``<div class="t1dx2edb">`` in ``soup``.

    Returns:
        A list of strings in document order, or ``[]`` when the lookup fails.
    """
    try:
        return [elem.get_text() for elem in soup.findAll("div", {"class": "t1dx2edb"})]
    except Exception:
        # Narrower than a bare except: interrupts still propagate.
        return []

def hasHotTub(array):
    """Report whether any amenity string mentions both "hot" and "tub".

    The check is case-insensitive so that "Hot tub" and "Private hot tub" both
    match.

    The previous implementation tested the truthiness of ``str.find``, which
    returns ``-1`` (truthy) when the substring is absent and ``0`` (falsy) when
    it is at the start, so unrelated amenities matched and "hot tub" did not.

    Args:
        array: Iterable of amenity strings.

    Returns:
        ``True`` if some item contains both words; ``False`` otherwise,
        including when ``array`` cannot be processed.
    """
    try:
        return any(
            "hot" in item.lower() and "tub" in item.lower()
            for item in array
        )
    except Exception:
        # Non-iterable input or non-string items: report "no hot tub".
        return False

def isSkiInSkiOut(array):
    """Report whether any amenity string contains "Ski-in/ski-out".

    The check is case-insensitive.

    The previous implementation tested the truthiness of ``str.find``, which
    returns ``-1`` (truthy) when the substring is absent and ``0`` (falsy) when
    it is at the start, so listings without the amenity were flagged and
    listings with it were not.

    Args:
        array: Iterable of amenity strings.

    Returns:
        ``True`` if some item contains the phrase; ``False`` otherwise,
        including when ``array`` cannot be processed.
    """
    try:
        return any("ski-in/ski-out" in item.lower() for item in array)
    except Exception:
        # Non-iterable input or non-string items: report "not ski-in/ski-out".
        return False

In [6]:
""" Parent function to scrape all diff items from page """
def extract_listing(page_url):
    """Fetch ``page_url`` with ``scrape_page`` and return its listing cards.

    Returns:
        Every ``<div class="_8s3ctt">`` element found in the parsed page.
    """
    return scrape_page(page_url).findAll("div", {"class": "_8s3ctt"})

In [7]:
""" Builds urls for defined data set """

def buildGridPageUrls(url, listings_per_page=20, pages_per_location=15):
    """Build the paginated search URLs for one location.

    Page ``i`` (starting at 0) is ``url`` with ``&items_offset=<listings_per_page * i>``
    appended, so ``url`` is expected to already carry a query string. The
    defaults give 15 pages of 20 listings, i.e. 300 listings.

    Returns:
        A list of ``pages_per_location`` URLs in ascending offset order.
    """
    return [
        url + f'&items_offset={listings_per_page * page_number}'
        for page_number in range(pages_per_location)
    ]


def getAllHomeLinks(soup):
    """Collect absolute listing URLs from a search-results page.

    For every ``<div class="c4mnd7m">`` card, the ``href`` of its
    ``<a class="l1j9v1wn">`` anchor is prefixed with ``https://www.airbnb.com``.

    Cards that lack the anchor, or whose anchor has no ``href``, are skipped.
    Previously such a card raised and aborted the whole crawl.

    Returns:
        A list of URLs in page order (possibly empty).
    """
    airbnb_base_url = 'https://www.airbnb.com'
    link_list = []
    for card in soup.findAll("div", {"class": "c4mnd7m"}):
        anchor = card.find("a", {"class":"l1j9v1wn"})
        href = anchor.get("href") if anchor is not None else None
        if href is None:
            continue
        link_list.append(airbnb_base_url + href)
    return link_list


In [8]:
def findNextPage(soup):
    """Return the URL behind the page's "Next" pagination link.

    The ``href`` of the first ``<a aria-label="Next">`` is prefixed with
    ``https://airbnb.com``.

    Returns:
        The absolute URL, or the sentinel string ``"no next page"`` when the
        link (or its ``href``) is missing.
    """
    try:
        return "https://airbnb.com" + soup.find("a", {"aria-label": "Next"}).get("href")
    except Exception:
        # Narrower than a bare except: interrupts still propagate.
        return "no next page"

    

In [9]:
def setupDriver():
    """Create a headless Chrome WebDriver.

    The chromedriver binary is obtained through ``ChromeDriverManager().install()``
    and its path is handed to Chrome through a ``Service`` object. Previously
    the returned path was discarded, so the managed driver was only used if it
    happened to be discoverable some other way.

    Returns:
        A ``webdriver.Chrome`` instance. The caller is responsible for calling
        ``quit()`` on it.
    """
    # Imported locally so this cell does not depend on an edit to the import cell.
    from selenium.webdriver.chrome.service import Service

    options = Options()
    options.page_load_strategy = 'normal'
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    return driver

In [10]:
def extractRentalInformation(url, page_wait = 3, click_wait = 3):
    """Scrape one listing page into a single-row DataFrame.

    Opens ``url`` in a fresh browser from ``setupDriver``, waits ``page_wait``
    seconds, tries to click the first element with class ``v7aged4``, waits
    ``click_wait`` seconds, and parses the resulting page source with
    BeautifulSoup. The individual fields are then read with the ``get*`` /
    ``is*`` / ``has*`` helpers.

    Args:
        url: Listing URL to load; also stored in the ``url`` column.
        page_wait: Seconds to sleep after loading the page.
        click_wait: Seconds to sleep after the click attempt.

    Returns:
        A ``pandas.DataFrame`` with exactly one row.

    Raises:
        Whatever the browser raises while loading the page or reading its
        source. The browser is always shut down first.
    """
    driver = setupDriver()
    try:
        driver.get(url)

        time.sleep(page_wait)

        try:
            driver.find_elements(By.CLASS_NAME, "v7aged4")[0].click()
        except Exception:
            pass # amenities button not found

        time.sleep(click_wait)

        details_page = driver.page_source
    finally:
        # Always release the browser; otherwise a failed page load leaves a
        # Chrome process behind for every listing that errors.
        driver.quit()

    soup = BS(details_page, features='html.parser')

    [city, state, country] = getLocationData(soup)
    # Both helpers return dicts; unpacking .values() relies on their key insertion order.
    multi_day_deal, cleaning_fee, airbnb_service_fee = getPricesFromMiniGuide(soup).values()
    listing_size, num_guests, num_bedrooms, num_bathrooms = getSleepingArrangement(soup).values()
    ammenities = getAmmenities(soup)
    reducedPricePerNight = getReducedPricePerNight(soup)
    origPricePerNight = getOrigPricePerNight(soup)
    # First available of: nightly price, reduced price, original price.
    chargePrice = getNightlyPrice(soup) or reducedPricePerNight or origPricePerNight

    # Each value is wrapped in a one-element list so the DataFrame has one row.
    dictionary = {
        "title": [getListingName(soup)],
        "rating" : [getListingRating(soup)],
        "nightlyRate" : [chargePrice],
        "city" : [city],
        "state" : [state],
        "country" : [country],
        "numberOfReviews": [getNumberReviews(soup)],
        "originalPricePerNight": [origPricePerNight],
        "reducedPricePerNight" : [reducedPricePerNight],
        "listing_size": [listing_size],
        "num_guests" : [num_guests],
        "num_bedrooms": [num_bedrooms],
        "num_bathrooms": [num_bathrooms],
        "multiDayDeal": [multi_day_deal],
        "cleaningFee" : [cleaning_fee],
        "airbnbServiceFee" : [airbnb_service_fee],
        "newListing": [isNewListing(soup)],
        "skiInSkiOut": [isSkiInSkiOut(ammenities)],
        "superHost" : [isSuperhost(soup)],
        "hotTub": [hasHotTub(ammenities)],
        "url" : [url]
    }

    return pd.DataFrame(dictionary)


In [11]:
def createAllListings(cityList):
    """Gather listing URLs for every location in ``cityList``.

    Each entry must have a ``"url"`` key holding a search URL. That URL is
    expanded into its paginated search pages with ``buildGridPageUrls`` and each
    page is scraped with ``getLinksForAllListingsInSearch``.

    Returns:
        One flat list of listing URLs, in location order and then page order.
    """
    collected_links = []
    for city_entry in cityList:
        for search_url in buildGridPageUrls(city_entry["url"]):
            collected_links += getLinksForAllListingsInSearch(search_url)
    return collected_links
    
def getLinksForAllListingsInSearch(url):
    """Load one search-results page in a browser and return its listing URLs.

    A fresh driver from ``setupDriver`` loads ``url``; after a fixed 2-second
    wait the page source is parsed with BeautifulSoup and passed to
    ``getAllHomeLinks``.

    Returns:
        Whatever ``getAllHomeLinks`` returns for the parsed page.

    Raises:
        Whatever the browser raises while loading the page. The browser is
        always shut down first.
    """
    driver = setupDriver()
    try:
        driver.get(url)
        time.sleep(2)
        details_page = driver.page_source
    finally:
        # Always release the browser, even when the page load fails.
        driver.quit()

    soup = BS(details_page, features='html.parser')
    return getAllHomeLinks(soup)

def createDataFrame(listings):
    """Scrape every URL in ``listings`` and stack the results into one DataFrame.

    Each URL is passed to ``extractRentalInformation``, which returns a
    DataFrame; the frames are concatenated with a fresh 0..n-1 index.

    Args:
        listings: Iterable of listing URLs.

    Returns:
        The combined DataFrame. An empty ``listings`` yields an empty DataFrame
        (``pd.concat`` would otherwise raise on an empty list).
    """
    dataFrames = [extractRentalInformation(listing) for listing in listings]
    if not dataFrames:
        return pd.DataFrame()

    # ignore_index=True already renumbers the rows; the former df.reset_index()
    # call discarded its result and had no effect.
    return pd.concat(dataFrames, ignore_index = True)




In [25]:
# Collect the listing URLs found on the search pages of every location in ski_homes.
# NOTE(review): a new headless browser is started for each search page, so this cell is slow.
listings = createAllListings(ski_homes)

In [27]:
# Persist the collected listing URLs to listings.csv.
listings_df = pd.DataFrame(listings)
# The frame is built from a plain list, so its only column is labelled 0; give it a readable name.
listings_df = listings_df.rename(columns = {0: 'cityUrl'})
listings_df.to_csv('listings.csv', index=False)

In [2]:

def extractAndCreateCsv(cityList, limit=3, output_path='scrapedData.csv'):
    """Scrape listings for ``cityList`` and append the rows to a CSV file.

    All listing URLs are collected with ``createAllListings``; the first
    ``limit`` of them are scraped with ``createDataFrame`` and appended to
    ``output_path``.

    The header row is written only when the file does not exist yet or is
    empty. Previously every run appended another header line into the data.

    Args:
        cityList: Location entries as accepted by ``createAllListings``.
        limit: Maximum number of listings to scrape. The default of 3 keeps the
            previous hard-coded behaviour; pass ``None`` to scrape them all.
        output_path: CSV file to append to.

    Returns:
        The result of ``DataFrame.to_csv``, which is ``None`` when writing to a path.
    """
    # Imported locally so this cell does not depend on an edit to the import cell.
    import os

    #create all listings
    listings = createAllListings(cityList)

    selected = listings if limit is None else listings[:limit]

    df = createDataFrame(selected)

    write_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0
    return df.to_csv(output_path, mode='a', header=write_header, index=False)


# Run the scrape for the configured locations.
# NOTE(review): as written, extractAndCreateCsv gathers every listing URL again but only scrapes the first three.
extractAndCreateCsv(ski_homes)
