# Webscraping small project

The aim of this notebook is to demonstrate simply how to use selenium to get content from a webpage.

### Prerequisite:
- webdriver-manager needs to be installed 
- a specific driver ( eg: geckodriver for firefox needs to be download and accessible by webdriver-manager)

## Importing libraries

In [61]:
import sys
import csv
import time
import pandas as pd
import numpy as np
import selenium
from selenium import webdriver
import time

In [62]:
selenium.__version__

'3.141.0'

## Settings

In [63]:
# home page airbnb
url = " https://www.airbnb.com/"

## Loading driver with url

In [64]:
# import the webdriver
driver = webdriver.Firefox()
driver.get(url)
time.sleep(5) # give time to init driver

## (1) Extract the home page info, write it in a csv

In [65]:
from selenium.webdriver.common.by import By

container = driver.find_element(By.ID, "site-content")

# Fetching the div tags with data-testid id correspon to listing-card-title ( where titles of content cards are contained)
titles = container.find_elements(By.CSS_SELECTOR, "div[data-testid='listing-card-title']")
tmp_subtitles = container.find_elements(By.CSS_SELECTOR, "div[data-testid='listing-card-subtitle']")

subtitles= np.reshape(tmp_subtitles, (-1, 2))

# Fetching the prices elements ( need more work to retrieve the price inside the elements)
price_elements = container.find_elements(By.CSS_SELECTOR, "div[data-testid='price-availability-row']")

price_list = []
for p in price_elements:
    # retrieve price as text in the price element
    price_text = p.find_element(By.CLASS_NAME, "_1y74zjx").text
    price_list.append(price_text)
    

print(f'Size titles : {len(titles)}')
print(f"Size prices : {len(price_list)}")



Size titles : 20
Size prices : 20


In [66]:
# Print all rows from column 0
print(subtitles[:,0])

[<selenium.webdriver.firefox.webelement.FirefoxWebElement (session="c7f49465-2602-4f0f-a95d-b0a25df7e6e4", element="99b25b53-b70d-4b25-bba4-5b829a8764c1")>
 <selenium.webdriver.firefox.webelement.FirefoxWebElement (session="c7f49465-2602-4f0f-a95d-b0a25df7e6e4", element="518b938b-cfa6-41eb-bf97-b0fb4642ef1b")>
 <selenium.webdriver.firefox.webelement.FirefoxWebElement (session="c7f49465-2602-4f0f-a95d-b0a25df7e6e4", element="b98f3b99-8e1a-4bfc-86a8-dff5f85610ea")>
 <selenium.webdriver.firefox.webelement.FirefoxWebElement (session="c7f49465-2602-4f0f-a95d-b0a25df7e6e4", element="a7fc70c2-7a46-4848-bcfd-ec84c0b8e8ef")>
 <selenium.webdriver.firefox.webelement.FirefoxWebElement (session="c7f49465-2602-4f0f-a95d-b0a25df7e6e4", element="695d6826-1c3d-4872-8afd-ed7f38670156")>
 <selenium.webdriver.firefox.webelement.FirefoxWebElement (session="c7f49465-2602-4f0f-a95d-b0a25df7e6e4", element="c45e2980-bf82-44c6-9a27-b5af37427aae")>
 <selenium.webdriver.firefox.webelement.FirefoxWebElement (sessi

In [67]:
data = {'Title': titles,
        'Price': price_list,
        'Distance' : subtitles[:,0],
        'Dates' : subtitles[:,1]}

df = pd.DataFrame(data)
df['Title'] =  df['Title'].transform(lambda x: x.text)
df['Distance'] =  df['Distance'].transform(lambda x: x.text)
df['Dates'] =  df['Dates'].transform(lambda x: x.text)

In [68]:
driver.quit()

In [69]:
df.head()

Unnamed: 0,Title,Price,Distance,Dates
0,"Claix, France",€ 182,6 kilometers away\n6 kilometers away,May 12 – 17\nMay 12 – 17
1,"Lumbin, France",€ 78,22 kilometers away\n22 kilometers away,May 1 – 6\nMay 1 – 6
2,"Charvieu-Chavagneux, France",€ 180,77 kilometers away\n77 kilometers away,May 19 – 24\nMay 19 – 24
3,"Saint-Alban-Leysse, France",€ 566,50 kilometers away\n50 kilometers away,May 10 – 15\nMay 10 – 15
4,"Sévrier, France",€ 317,83 kilometers away\n83 kilometers away,May 1 – 6\nMay 1 – 6


### Formatting the columns

We want :
- Clean `Distance` columns,
- Clean and split `Dates` column into 2 columns : `Date_Checkin`, `Date_Checkout`.

In [None]:
#### Formatting `Distance`

In [126]:
df['Distance_km'] = df['Distance'].str.replace(r' .*\n.*', '') # only keep number

  df['Distance_km'] = df['Distance'].str.replace(r' .*\n.*', '') # only keep number


In [130]:
df.head(30)

Unnamed: 0,Title,Price,Distance,Dates,Distance2,"(20,)",Distance_km
0,"Claix, France",€ 182,6,May 12 – 17,6 kilometers away,6 kilometers away,6
1,"Lumbin, France",€ 78,22,May 1 – 6,22 kilometers away,22 kilometers away,22
2,"Charvieu-Chavagneux, France",€ 180,77,May 19 – 24,77 kilometers away,77 kilometers away,77
3,"Saint-Alban-Leysse, France",€ 566,50,May 10 – 15,50 kilometers away,50 kilometers away,50
4,"Sévrier, France",€ 317,83,May 1 – 6,83 kilometers away,83 kilometers away,83
5,"Lathuile, France",€ 651,80,May 1 – 6,80 kilometers away,80 kilometers away,80
6,"Verrens-Arvey, France",€ 162,71,Dec 1 – 6,71 kilometers away,71 kilometers away,71
7,"Novalaise, France",€ 236,44,Jun 30 – Jul 5,44 kilometers away,44 kilometers away,44
8,"Saint-Jean-de-Moirans, France",€ 63,22,Sep 1 – 6,22 kilometers away,22 kilometers away,22
9,"Saint-Jean-de-Moirans, France",€ 96,22,Sep 16 – 21,22 kilometers away,22 kilometers away,22


#### Formating `Dates`

In [128]:
df['Dates'] = df['Dates'].str.replace(r'\n.*', '') # remove duplicate line


  df['Dates'] = df['Dates'].str.replace(r'\n.*', '') # remove duplicate line


In [129]:
df['Dates'].head()

0    May 12 – 17
1      May 1 – 6
2    May 19 – 24
3    May 10 – 15
4      May 1 – 6
Name: Dates, dtype: object

In [160]:
df[['Date_checkin', 'Date_checkout']] = df['Dates'].str.split('– ', expand=True)

In [164]:
df.head(15)

Unnamed: 0,Title,Price,Distance,Dates,Distance2,"(20,)",Distance_km,Date_checkin,Date_checkout
0,"Claix, France",€ 182,6,May 12 – 17,6 kilometers away,6 kilometers away,6,May 12,May 12 17
1,"Lumbin, France",€ 78,22,May 1 – 6,22 kilometers away,22 kilometers away,22,May 1,May 1 6
2,"Charvieu-Chavagneux, France",€ 180,77,May 19 – 24,77 kilometers away,77 kilometers away,77,May 19,May 19 24
3,"Saint-Alban-Leysse, France",€ 566,50,May 10 – 15,50 kilometers away,50 kilometers away,50,May 10,May 10 15
4,"Sévrier, France",€ 317,83,May 1 – 6,83 kilometers away,83 kilometers away,83,May 1,May 1 6
5,"Lathuile, France",€ 651,80,May 1 – 6,80 kilometers away,80 kilometers away,80,May 1,May 1 6
6,"Verrens-Arvey, France",€ 162,71,Dec 1 – 6,71 kilometers away,71 kilometers away,71,Dec 1,Dec 1 6
7,"Novalaise, France",€ 236,44,Jun 30 – Jul 5,44 kilometers away,44 kilometers away,44,Jun 30,Jul 5
8,"Saint-Jean-de-Moirans, France",€ 63,22,Sep 1 – 6,22 kilometers away,22 kilometers away,22,Sep 1,Sep 1 6
9,"Saint-Jean-de-Moirans, France",€ 96,22,Sep 16 – 21,22 kilometers away,22 kilometers away,22,Sep 16,Sep 16 21


In [162]:
import string
def add_month_if_not_found(df):
    if df.Date_checkout.startswith(tuple(string.digits)):
        prefix = df.Date_checkin.replace(r' .*', ' ')
        return prefix + df['Date_checkout']
    return df['Date_checkout']

In [163]:
df['Date_checkout'] = df.apply( add_month_if_not_found, axis=1)

In [159]:
df.head()

Unnamed: 0,Title,Price,Distance,Dates,Distance2,"(20,)",Distance_km,Date_checkin,Date_checkout
0,"Claix, France",€ 182,6,May 12 – 17,6 kilometers away,6 kilometers away,6,May 12,May 12 17
1,"Lumbin, France",€ 78,22,May 1 – 6,22 kilometers away,22 kilometers away,22,May 1,May 1 6
2,"Charvieu-Chavagneux, France",€ 180,77,May 19 – 24,77 kilometers away,77 kilometers away,77,May 19,May 19 24
3,"Saint-Alban-Leysse, France",€ 566,50,May 10 – 15,50 kilometers away,50 kilometers away,50,May 10,May 10 15
4,"Sévrier, France",€ 317,83,May 1 – 6,83 kilometers away,83 kilometers away,83,May 1,May 1 6


In [70]:
# TODO : Format
# TODO : Write in csv