In [61]:
import time
from selenium import webdriver 
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from sklearn.feature_extraction.text import CountVectorizer
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests
import re
import json
import pymongo
from pymongo import MongoClient

In [2]:
#Webdriver options
chrome_options = Options()
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--verbose")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--incognito")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument(" --disable-gpu")
chrome_options.add_argument(" --disable-infobars")
chrome_options.add_argument(" -–disable-web-security")
chrome_options.add_argument("--no-sandbox")

In [3]:
def setupDriver(url, waiting_time=2.5):
    ''' Initializes the driver of selenium'''
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get(url)
    driver.maximize_window()
    time.sleep(waiting_time)
    # accept Cookies
    element=driver.find_element_by_xpath('/html/body/div[6]/div/div/div[1]/section/footer/div[2]/button')
    print(element)
    element.click()
    return driver

In [5]:
def getPage(url):
    ''' returns a soup object that contains all the information
    of a certain webpage'''
    result = requests.get(url)
    content = result.content
    return BeautifulSoup(content, features="lxml")
bsPage= getPage("https://www.airbnb.fr/s/Paris/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=may&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=flexible_dates&source=structured_search_input_header&search_type=autocomplete_click&query=Paris&place_id=ChIJD7fiBh9u5kcRYJSMaMOCCwQ")
#bsPage

In [6]:
def getRoomClasses(soupPage):
    ''' returns all the listings that can
    be found on the page in a list.'''
    rooms = soupPage.findAll("div", {"class": "_8ssblpx"})
    result = []
    for room in rooms:
        result.append(room)
    return result

In [7]:
def getListingLink(listing):
    ''' returns the link of the listing'''
    return "http://airbnb.com" + listing.find("a")["href"]

In [8]:
def getListingTitle(listing):
    ''' This function returns the title of the listing'''
    return listing.find("meta")["content"]

In [9]:
def getTopRow(listing):
    ''' Returns the top row of listing information'''
    return listing.find("div", {"class": "_1tanv1h"}).text

In [10]:
def getRoomInfo(listing):
    ''' Returns the guest information'''
    return listing.find("div", {"class": "_kqh46o"}).text

In [11]:
def getBasicFacilities(listing):
    ''' Returns the basic facilities'''
    try:
        output = listing.findAll("div", {"class": "_kqh46o"})[1].text.replace(" ", "")
    except:
        output = []
    return output

In [12]:
def getListingPrice(listing):
    ''' Returns the price'''
    return listing.find("span", {"class": "_olc9rf0"}).text

In [13]:
def getListingRating(listing):
    ''' Returns the rating '''
    try:  # Not all listings have reviews // extraction failed
        output = listing.find("span", {"class": "_10fy1f8"}).text
    except:
        output = -1  # Indicate that the extraction failed -> can indicate no reviews or a mistake in scraping
    return output

In [14]:
def getListingReviewNumber(listing):
    ''' Returns the number of reviews '''
    try:  # Not all listings have reviews // extraction failed
        output = listing.findAll("span", {"class": "_a7a5sx"}).text
    except:
        output = -1  # Indicate that the extraction failed -> can indicate no reviews or a mistake in scraping
    return output

In [15]:
def extractInformation(soupPage):
    ''' Takes all the information of a single page (thus multiple listings) and
    summarizes it in a dataframe'''
    listings = getRoomClasses(soupPage)
    titles, links, toprows, roominfos, basicfacilitiess, prices, ratings, reviews = [], [], [], [], [], [], [], []
    for listing in listings:
        titles.append(getListingTitle(listing))
        links.append(getListingLink(listing))
        toprows.append(getTopRow(listing))
        roominfos.append(getRoomInfo(listing))
        basicfacilitiess.append(getBasicFacilities(listing))
        prices.append(getListingPrice(listing))
        ratings.append(getListingRating(listing))
        reviews.append(getListingReviewNumber(listing))
    dictionary = {"title": titles, "toprow": toprows, "roominfo": roominfos, "facilities": basicfacilitiess,
                  "price": prices, "rating": ratings, "link": links, "reviewnumber": reviews}
    return pd.DataFrame(dictionary)

In [16]:
def findNextPage(soupPage): 
    return soupPage.select("nav div._jro6t0 a._1y623pm")[-1].getText()
    if not(soupPage.find("div", {"class": "_jro6t0"}).find("button", {"class": "_za9j7e"})):
        return "https://airbnb.com" + soupPage.find("div", {"class": "_jro6t0"}).find("a")["href"]
    else:
        return "no next page"
parisURl= "https://www.airbnb.fr/s/Paris/homes?flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=may&adults=13&source=structured_search_input_header&search_type=pagination&tab_id=home_tab&checkin=2021-04-17&refinement_paths%5B%5D=%2Fhomes&date_picker_type=calendar&flexible_trip_lengths%5B%5D=one_month&checkout=2021-04-18&ne_lat=49.331444859190796&ne_lng=6.620213281250017&sw_lat=47.91548109370419&sw_lng=0.46786953125001673&zoom=7&search_by_map=true&place_id=ChIJD7fiBh9u5kcRYJSMaMOCCwQ&federated_search_session_id=b5ed7a5a-d074-4ef7-8263-2584a1a250df"
dt=findNextPage(bsPage)
dt


'15'

In [17]:
def getPages(url,driver):
    ''' This function returns all the links to the pages containing 
    listings for one particular city '''
    length = int(getPage(url).select("nav div._jro6t0 a._1y623pm")[-1].getText())
    element=driver.find_element_by_css_selector('a[aria-label="Suivant"]')
    result = []
    cmp=1
    while cmp < length: 
        try:
            el = WebDriverWait(driver, 120).until( \
            EC.presence_of_element_located((By.CSS_SELECTOR, 'a[aria-label="Suivant"]')))
            actions = ActionChains(driver)
            actions.move_to_element(el).perform()
            el.click()
            page = getPage(driver.current_url)
            result = result + [page]
            element = el
            cmp = cmp + 1
        except StaleElementReferenceException:
            break
    return result


In [18]:
def extractPages(url, driver):
    ''' This function outputs a dataframe that contains all information of a particular
    city. It thus contains information of multiple listings coming from multiple pages.'''
    pages = getPages(url, driver)
    df = extractInformation(pages[0])
    for pagenumber in range(1, len(pages)):
        df = df.append(extractInformation(pages[pagenumber]))
    return df

In [19]:
paris = "https://www.airbnb.fr/s/Paris/homes?flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=may&source=structured_search_input_header&search_type=pagination&tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&date_picker_type=flexible_dates&flexible_trip_lengths%5B%5D=one_month&place_id=ChIJD7fiBh9u5kcRYJSMaMOCCwQ&federated_search_session_id=6a898d93-30d4-4c58-ad1c-a18df3cf01d2"
newyork = "https://www.airbnb.fr/s/New-York--%C3%89tat-de-New-York--%C3%89tats~Unis/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=may&date_picker_type=flexible_dates&source=structured_search_input_header&search_type=filter_change&place_id=ChIJOwg_06VPwokRYv534QaPC8g&flexible_trip_lengths%5B%5D=one_month"
rome = "https://www.airbnb.fr/s/Rome--Italie/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=may&date_picker_type=flexible_dates&source=structured_search_input_header&search_type=filter_change&place_id=ChIJu46S-ZZhLxMROG5lkwZ3D7k&flexible_trip_lengths%5B%5D=one_month"
londre = "https://www.airbnb.fr/s/londre/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=may&date_picker_type=flexible_dates&source=structured_search_input_header&search_type=search_query&flexible_trip_lengths%5B%5D=one_month"
tokyo = "https://www.airbnb.fr/s/Tokyo--Japon/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=may&date_picker_type=flexible_dates&source=structured_search_input_header&search_type=autocomplete_click&flexible_trip_lengths%5B%5D=one_month&query=Tokyo%2C%20Japon&place_id=ChIJ51cu8IcbXWARiRtXIothAS4"

urls = [["Paris", paris], ["New York", newyork], ["Rome", rome], ["Londre", londre], ["Tokyo", tokyo]]

In [20]:
def scrapeURLs(listofURLs):
    ''' This function scrapes all listings of the cities listed in a list together
    with their URLs'''
    driver = setupDriver(listofURLs[0][1])
    WebDriverWait(driver, 120).until( \
            EC.presence_of_element_located((By.CSS_SELECTOR, 'header._6n0mzrr')))
    print(listofURLs[0][0]) 
    df = extractPages(listofURLs[0][1], driver)
    df.loc[:, "city"] = listofURLs[0][0] 
    for i in range(1, len(listofURLs)):
        print(listofURLs[i][0])
        driver.get(listofURLs[i][1])
        WebDriverWait(driver, 120).until( \
            EC.presence_of_element_located((By.CSS_SELECTOR, 'header._6n0mzrr')))
        newrows = extractPages(listofURLs[i][1], driver)
        newrows.loc[:, "city"] = listofURLs[i][0]
        df = df.append(newrows)
    return df

In [21]:
df = scrapeURLs(urls)
df

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389
[WDM] - Driver [/Users/abdo17r/.wdm/drivers/chromedriver/mac64/89.0.4389.23/chromedriver] found in cache




<selenium.webdriver.remote.webelement.WebElement (session="be01ac967bcf1c3c9bd51a9542743e25", element="6ef76d5e-6fe6-45b3-b30a-dfcc289779ce")>
Paris
New York
Rome
Londre
Tokyo


Unnamed: 0,title,toprow,roominfo,facilities,price,rating,link,reviewnumber,city
0,Duplex d'artiste à une station de Paris. - nul...,Logement entier à Montreuil,2 voyageurs · 1 chambre · 1 lit · 1 salle de bain,Wifi·Cuisine·Chauffage,867€,4.88,http://airbnb.com/rooms/2456539?adults=1&child...,-1,Paris
1,Studette à Paris 11 - bail de mobilité - null ...,Logement entier à 11e arrondissement,1 voyageur · Studio · 1 lit · 1 salle de bain,Wifi·Cuisine·Chauffage,627€,4.31,http://airbnb.com/rooms/13652841?adults=1&chil...,-1,Paris
2,chambre idéale télétravail Paris - null - Le K...,Chambre privée à 13e arrondissement,2 voyageurs · 1 chambre · 1 lit · 1 salle de b...,Wifi·Cuisine·Chauffage·Lave-linge,725€,-1,http://airbnb.com/rooms/48800755?adults=1&chil...,-1,Paris
3,#CosyStudio @ Paris betw. Opéra & Montmartre -...,Logement entier à 9e arrondissement,2 voyageurs · Studio · 1 lit · 1 salle de bain,Hôte:uneentreprise·Wifi·Cuisine·Chauffage,1 718€,4.83,http://airbnb.com/rooms/11310067?adults=1&chil...,-1,Paris
4,Un studio à deux pas de Trocadéro - null - Paris,Logement entier à 16e arrondissement,2 voyageurs · Studio · 1 lit · 1 salle de bain,Wifi·Cuisine·Chauffage·Lave-linge,777€,3.88,http://airbnb.com/rooms/45501057?adults=1&chil...,-1,Paris
...,...,...,...,...,...,...,...,...,...
15,H183_WalkToShinjuku!8mins!!Enjoy!TheBestLocati...,Logement entier à Shibuya City,2 voyageurs · 1 chambre · 1 lit · 1 salle de bain,Cuisine·Wifi·Climatisation·Sèche-cheveux,863€,4.60,http://airbnb.com/rooms/38203819?adults=1&chil...,-1,Tokyo
16,Amazing location!Private Apt in Shinjuku Kabuk...,Logement entier à Shinjuku,2 voyageurs · 1 chambre · 2 lits · 1 salle de ...,Cuisine·Wifi·Climatisation·Sèche-cheveux,714€,4.33,http://airbnb.com/rooms/12800094?adults=1&chil...,-1,Tokyo
17,★割引有月7万★Asakusa Sika Hotel no window twin room...,Chambre dans un boutique-hôtel à Taito City,"2 voyageurs · 1 chambre · 0 lit · 1,5 salle de...",Wifi·Climatisation·Sèche-cheveux,682€,-1,http://airbnb.com/rooms/43718335?adults=1&chil...,-1,Tokyo
18,"Mukojima 1F/Asakusa, Skytree/Japanese Modern/W...",Logement entier à Sumida City,"4 voyageurs · 1 chambre · 2 lits · 1,5 salle d...",Cuisine·Wifi·Climatisation·Sèche-cheveux,812€,-1,http://airbnb.com/rooms/39734961?adults=1&chil...,-1,Tokyo


In [22]:
def cleanTitle(df):
    df.loc[:, "name"] = df["title"].str.split(" null ", n = 0, expand = True)[0].str.replace("-", "")
    df.loc[:, "location"] = df["title"].str.split(" null ", n = 0, expand = True)[1].str.replace("-", "").str.strip()
    return df.drop("title", axis = 1)

In [24]:
def cleanTopRow(df):
    df.loc[:, 'roomtype'] = df["toprow"].str.split(" à ", n = 0, expand = True)[0] 
    df.loc[:, 'detailed_location'] = df["toprow"].str.split(" à ", n = 0, expand = True)[1] 
    return df.drop("toprow", axis = 1)

In [25]:
def cleanRoomInfo(df):
    df.loc[:, "guests"] = df.loc[:, "roominfo"].str.split(" · ", n = 0, expand = True)[0].str.replace(" voyageurs", "")
    df.loc[:, "bedrooms"] = df.loc[:, "roominfo"].str.split(" . ", n = 0, expand = True)[1]
    df.loc[:, "beds"] = df.loc[:, "roominfo"].str.split(" . ", n = 0, expand = True)[2].str.replace(" lit", "").str.replace("s", "")
    df.loc[:, "bathrooms"] = df.loc[:, "roominfo"].str.split(" . ", n = 0, expand = True)[3]
    df.loc[:, "guests"] = pd.to_numeric(df.guests, errors = 'coerce')
    df.loc[:, "beds"] = pd.to_numeric(df.beds, errors = 'coerce')
    df.loc[:, "bedrooms"] = pd.to_numeric(df.bedrooms.str.split(" ", n = 0, expand = True)[0], errors = "ignore")
    df.loc[:, "bathrooms"] = pd.to_numeric(df.bathrooms.str.split(" ", n = 0, expand = True)[0], errors = "ignore")
    return df.drop("roominfo", axis = 1)

In [147]:
def cleanPrice(df):
    df.loc[:, "pricepernight"] = df.loc[:, "price"].str.replace(" ", "").str.replace("€", "")
    df.loc[:, "pricepernight"] = pd.to_numeric(df.pricepernight, errors = 'coerce')
    return df.drop("pricepernight", axis = 1)


In [31]:
def cleanFacilities(df):
    df.loc[:, "facilities"] = df["facilities"].astype(str).str.replace("[","").str.replace("]","")
    vectorizer = CountVectorizer(decode_error = "ignore") 
    X = vectorizer.fit_transform(df.facilities)
    bag_of_words = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    return pd.concat([df.reset_index(drop=True).drop("facilities", axis = 1), bag_of_words], axis=1)

In [48]:
def cleanRating(df):
    df.loc[:, "score"] = df.loc[:, 'rating']
    df.loc[:, "score"] = pd.to_numeric(df.score, errors = "coerce")
    return df.drop("rating", axis = 1)

In [28]:
def cleanReviewNumber(df):
    df.loc[:, "reviewnumber"] = df.loc[:, 'reviewnumber'].str.split(" ", n = 0, expand = True)[0]
    df.loc[:, "reviewnumber"] = pd.to_numeric(df.reviewnumber, errors = "coerce")
    return df

In [150]:
def clean(df):
    df = cleanTitle(df)
    df = cleanFacilities(df)
    df = cleanTopRow(df)
    df = cleanRoomInfo(df)
    df = cleanPrice(df)
    df = cleanRating(df)
    #df = cleanReviewNumber(df)
    col1 = pd.to_numeric(df.pop('price').str.replace("€", ""), errors = "coerce")
    df = pd.concat([df.reset_index(drop=True), col1], axis=1)
    col2 = df.pop('reviewnumber')
    df = pd.concat([df.reset_index(drop=True), col2], axis=1) 
    col3 = df.pop('link')
    df = pd.concat([df.reset_index(drop=True), col3], axis=1) 
    return df

In [152]:
cleanedDf = clean(df)
cleanedDf.replace(np.nan, 0)

Unnamed: 0,city,name,location,chauffage,cheveux,climatisation,cuisine,hôte,lave,linge,...,roomtype,detailed_location,guests,bedrooms,beds,bathrooms,score,price,reviewnumber,link
0,Paris,Duplex d'artiste à une station de Paris.,Montreuil,1,0,0,1,0,0,0,...,Logement entier,Montreuil,2.0,1,1.0,1,4.88,867.0,-1,http://airbnb.com/rooms/2456539?adults=1&child...
1,Paris,Studette à Paris 11 bail de mobilité,Paris,1,0,0,1,0,0,0,...,Logement entier,11e arrondissement,0.0,Studio,1.0,1,4.31,627.0,-1,http://airbnb.com/rooms/13652841?adults=1&chil...
2,Paris,chambre idéale télétravail Paris,Le KremlinBicêtre,1,0,0,1,0,1,1,...,Chambre privée,13e arrondissement,2.0,1,1.0,1,-1.00,725.0,-1,http://airbnb.com/rooms/48800755?adults=1&chil...
3,Paris,#CosyStudio @ Paris betw. Opéra & Montmartre,Paris,1,0,0,1,1,0,0,...,Logement entier,9e arrondissement,2.0,Studio,1.0,1,4.83,0.0,-1,http://airbnb.com/rooms/11310067?adults=1&chil...
4,Paris,Un studio à deux pas de Trocadéro,Paris,1,0,0,1,0,1,1,...,Logement entier,16e arrondissement,2.0,Studio,1.0,1,3.88,777.0,-1,http://airbnb.com/rooms/45501057?adults=1&chil...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,Tokyo,H183_WalkToShinjuku!8mins!!Enjoy!TheBestLocati...,Shibuya City,0,1,1,1,0,0,0,...,Logement entier,Shibuya City,2.0,1,1.0,1,4.60,863.0,-1,http://airbnb.com/rooms/38203819?adults=1&chil...
1396,Tokyo,Amazing location!Private Apt in Shinjuku Kabuk...,Shinjuku,0,1,1,1,0,0,0,...,Logement entier,Shinjuku,2.0,1,2.0,1,4.33,714.0,-1,http://airbnb.com/rooms/12800094?adults=1&chil...
1397,Tokyo,★割引有月7万★Asakusa Sika Hotel no window twin room...,Taito City,0,1,1,0,0,0,0,...,Chambre dans un boutique-hôtel,Taito City,2.0,1,0.0,"1,5 salle",-1.00,682.0,-1,http://airbnb.com/rooms/43718335?adults=1&chil...
1398,Tokyo,"Mukojima 1F/Asakusa, Skytree/Japanese Modern/W...",Sumida City,0,1,1,1,0,0,0,...,Logement entier,Sumida City,4.0,1,2.0,"1,5 salle",-1.00,812.0,-1,http://airbnb.com/rooms/39734961?adults=1&chil...


In [153]:
mongo_client = MongoClient('localhost', 27017)

In [154]:
airbnb = mongo_client['airbnb']

In [155]:
records = cleanedDf.to_dict(orient='records')
#records

In [156]:
airbnb.collection.insert_many(records)

<pymongo.results.InsertManyResult at 0x7f7f7d14e870>

In [None]:
# Moyen du score par chaque ville
grp = {$group:{_id:"$city",rating:{$avg:"$score"}}}
tri = {$sort : {"_id" : -1}}
db.collection.aggregate([grp,tri])

In [None]:
# Moyen du score par chaque type de logement
grp = {$group:{_id:"$roomtype",rating:{$avg:"$score"}}}
tri = {$sort : {"_id" : -1}}
db.collection.aggregate([grp,tri])

In [None]:
# Moyen du score pour chaque ville et par chaque type de logement
grp = {$group:{_id:{city:"$city",roomtype:"$roomtype"},rating:{$avg:"$score"}}}
tri = {$sort : {"_id" : -1}}
db.collection.aggregate([grp,tri])

In [None]:
#Total type de logement par ville
db.collection.aggregate({$group:{_id:{city:"$city",roomType:"$roomtype"}}},{$group:{_id:"$_id.city",Type_nbr:{$sum:1}}})

In [None]:
# Moyen du prix pour chaque ville et par chaque type de logement
grp = {$group:{_id:{city:"$city",roomtype:"$roomtype"},avgPrice:{$avg:"$price"}}}
tri = {$sort : {"_id" : -1}}
db.collection.aggregate([grp,tri])

In [None]:
# Total de capacité pour chaque ville
db.collection.aggregate({$group:{_id:"$city",guests:{$sum:"$beds"}}})

city                  object
name                  object
location              object
chauffage              int64
cheveux                int64
climatisation          int64
cuisine                int64
hôte                   int64
lave                   int64
linge                  int64
sèche                  int64
uneentreprise          int64
wifi                   int64
roomtype              object
detailed_location     object
guests               float64
bedrooms              object
beds                 float64
bathrooms             object
score                float64
price                float64
reviewnumber           int64
link                  object
dtype: object