In [1]:
## Set up
from selenium import webdriver
from selenium.webdriver.common.by import By
# from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains

import datetime
import time

import pandas as pd

# options = Options()
# options.add_experimental_option("detach", True)

# path = "C:\Program Files (x86)\chromedriver.exe"

# driver = webdriver.Chrome(path, options=options)

driver = webdriver.Chrome()
driver.implicitly_wait(5)

In [2]:
def page(driver=driver, url: str= "https://www.booking.com/"):
    driver.get(url)
    driver.maximize_window()

In [3]:
def accept_coockies(driver = driver, popup_name: str = "onetrust-accept-btn-handler") -> None:
    driver.find_element(By.ID, popup_name).click()

def manage_coockies(driver = driver, popup_name: str = "onetrust-pc-btn-handler") -> None:
    driver.find_element(By.ID, popup_name).click()
    driver.find_element(By.XPATH, "//button[text()='Confirm My Choices']").click()

def go_to_flights(driver = driver, name = "Flights"):
    driver.find_element(By.LINK_TEXT, name).click()

In [4]:
def input_from_city_airport(driver = driver, city: str = "brussels", airport: str = "AIRPORTBRU") -> None:
    driver.find_element(By.CSS_SELECTOR, "button[data-ui-name='input_location_from_segment_0']").click()

    # Clear bar to avoid unwanted input
    driver.find_element(By.CSS_SELECTOR, ".css-rh2lq6").click()

    # Input departure city
    fly_from = driver.find_element(By.CSS_SELECTOR, "input[data-ui-name='input_text_autocomplete']")
    fly_from.send_keys('  ' + city.lower())
    fly_from_airport = driver.find_element(By.CSS_SELECTOR, f"input[name='{airport.upper()}']")
    fly_from_airport.click()

In [5]:
def input_to_city_airport(driver = driver, city: str = "jakarta", airport: str = "AIRPORTCGK") -> None:
    driver.find_element(By.CSS_SELECTOR, "button[data-ui-name='input_location_to_segment_0']").click()
    fly_to = driver.find_element(By.CSS_SELECTOR, "input[data-ui-name='input_text_autocomplete']")
    fly_to.send_keys('  ' + city.lower())
    fly_to_airport = driver.find_element(By.CSS_SELECTOR, f"input[name='{airport.upper()}']")
    fly_to_airport.click()

In [6]:
def round_trip_date_chooser(driver=driver, css_from_date: str= 'span[data-date="2023-08-01"]', css_to_date: str='span[data-date="2023-08-15"]'):
    date_roundtrip_click = driver.find_element(By.CSS_SELECTOR, 'button[data-ui-name="button_date_segment_0"]')
    date_roundtrip_click.click()
    arrow = driver.find_element(By.CSS_SELECTOR, 'button[class="Actionable-module__root___lQCcK Button-module__root___puEjU Button-module__root--variant-tertiary___MhiYJ Button-module__root--icon-only___uhCdT Button-module__root--size-large___Oef9C Button-module__root--wide-false___ngEa+ Button-module__root--variant-tertiary-neutral___zGwxJ Calendar-module__control___5hEew Calendar-module__control--next___N9ipu"]')
    arrow.click()
    dep_date_button = driver.find_element(By.CSS_SELECTOR, css_from_date)
    dep_date_button.click()
    arr_date_button = driver.find_element(By.CSS_SELECTOR, css_to_date)
    arr_date_button.click()
    search_click = driver.find_element(By.CSS_SELECTOR, 'button[data-ui-name="button_search_submit"]')
    search_click.click()

In [7]:
def airline_chooser(driver = driver, airline_1: str= "Lufthansa", airline_2: str="Qatar Airways"):
    # time.sleep(2)
    # driver.execute_script("window.scrollTo(0,400)")
    time.sleep(2)
    ticket_type = driver.find_element(By.CSS_SELECTOR, ".Link-module__root--variant-primary___skVxI, .Link-module__root--variant-secondary___IJWjg")
    ticket_type.click()
    time.sleep(2)
    element_to_hover_over = driver.find_element(By.CSS_SELECTOR, "[data-testid='search_filter_airline_" + airline_1 + "']")
    hover = ActionChains(driver).move_to_element(element_to_hover_over)
    hover.perform()
    time.sleep(2)
    air_1 = driver.find_element(By.CSS_SELECTOR, 'button[class="Actionable-module__root___lQCcK Link-module__root___Jo24k Link-module__root--variant-primary___skVxI"]')
    air_1.click()
    air_2 = driver.find_element(By.CSS_SELECTOR, "[data-testid='search_filter_airline_" + airline_2 + "']" )
    air_2.click()

In [8]:
def scrape_card(card_number: int=0) -> dict:
    """
    Function that generates a dictionary of a specific flight-card (in this case round-trip ticket).

    Args:
    -----
        card_number (int): number of flight card interested in
                        (Default sets to 0)

    Returns:
    --------
        dict: dictionnary

    Comments:
    ---------
        flight-cards: the offred results obtain when searching flight tickets. 
                        A flight-card includes relevant information of a flight-ticket.
        
        card_number: always starts at 0, 
                    i.e., the number of the first flight-card is 0.
        
        dictionnary: it contains 18 values which are,

            airline_company : name of the airline company
            dep_city        : departure city (where the flight starts form)
            arr_city        : arrival city (the destination of the flight)
            out_dep_date    : departure day of the outbound flight
            out_dep_time    : departure time of the outbound flight
            out_duration    : lenght (duration) of the outbound flight
            out_stop_number : number of stopover(s) or layover(s)
            out_arr_date    : arrival day of the outbound flight
            out_arr_time    : arrival time of the outbound flight
            in_dep_date     : departure day of the inbound flight
            in_dep_time     : departure time of the inbound flight
            in_duration     : lenght of the inbound flight
            in_stop_num     : number of stopover(s) or layover(s)
            in_arr_date     : arrival day of the inbound flight
            in_arr_time     : arrival time of the inbound flight
            price_ticket    : flight price ticket

            hour_scrap     : hour in which the scrap has been done
            day_scrap      : day which the scrap has been performed
    """
    ticket = {}
    
    ticket['airline_company']= driver.find_element(
        By.XPATH, f'//*[@id="flight-card-{card_number}"]/div/div/div[1]/div[3]/div/div/div'
        ).text
    
    ticket['dep_city'] = driver.find_element(
        By.XPATH, f'//*[@id="flight-card-{card_number}"]/div/div/div[1]/div[1]/div[2]/div/div/div[1]/div[2]/div[1]'
        ).text
    
    ticket['arr_city'] = driver.find_element(
        By.XPATH, f'//*[@id="flight-card-{card_number}"]/div/div/div[1]/div[1]/div[2]/div/div/div[3]/div[2]/div[1]'
        ).text
    
    ticket['out_dep_date'] = driver.find_element(
        By.XPATH, f'//*[@id="flight-card-{card_number}"]/div/div/div[1]/div[1]/div[2]/div/div/div[1]/div[2]/div[3]'
        ).text

    ticket['out_dep_time'] =  driver.find_element(
        By.XPATH, f'//*[@id="flight-card-{card_number}"]/div/div/div[1]/div[1]/div[2]/div/div/div[1]/div[1]'
        ).text
    
    ticket['out_duration'] = driver.find_element(
        By.XPATH, f"//*[@id='flight-card-{card_number}']/div/div/div[1]/div[1]/div[2]/div/div/div[2]/div[1]"
        ).text
    
    ticket['out_stop_num'] = driver.find_element(
        By.XPATH, f"//*[@id='flight-card-{card_number}']/div/div/div[1]/div[1]/div[2]/div/div/div[2]/div[3]"
        ).text
    
    ticket['out_arr_date'] = driver.find_element(
        By.XPATH, f'//*[@id="flight-card-{card_number}"]/div/div/div[1]/div[1]/div[2]/div/div/div[3]/div[2]/div[3]'
        ).text
    
    ticket['out_arr_time'] = driver.find_element(
        By.XPATH, f"//*[@id='flight-card-{card_number}']/div/div/div[1]/div[1]/div[2]/div/div/div[3]/div[1]"
        ).text
    
    ticket['in_dep_date'] = driver.find_element(
        By.XPATH, f'//*[@id="flight-card-{card_number}"]/div/div/div[1]/div[4]/div[2]/div/div/div[1]/div[2]/div[3]'
        ).text
    
    ticket['in_dep_time'] = driver.find_element(
        By.XPATH, f'//*[@id="flight-card-{card_number}"]/div/div/div[1]/div[4]/div[2]/div/div/div[1]/div[1]'
        ).text
    
    ticket['in_duration'] = driver.find_element(
        By.XPATH, f'//*[@id="flight-card-{card_number}"]/div/div/div[1]/div[4]/div[2]/div/div/div[2]/div[1]'
        ).text
    
    ticket['in_stop_num'] = driver.find_element(
        By.XPATH, f'//*[@id="flight-card-{card_number}"]/div/div/div[1]/div[4]/div[2]/div/div/div[2]/div[3]'
        ).text

    ticket['in_arr_date'] = driver.find_element(
        By.XPATH, f'//*[@id="flight-card-{card_number}"]/div/div/div[1]/div[4]/div[2]/div/div/div[3]/div[2]/div[3]'
        ).text
    
    ticket['in_arr_time'] = driver.find_element(
        By.XPATH, f'//*[@id="flight-card-{card_number}"]/div/div/div[1]/div[4]/div[2]/div/div/div[3]/div[1]'
        ).text
    
    see_flight = driver.find_element(
        By.XPATH, f'//*[@id="flight-card-{card_number}"]/div/div/div[2]/div[2]/button'
    ).click()

    container = driver.find_elements(By.CSS_SELECTOR, ".SheetContainer-module__content___WX8xI")
    ticket['price_ticket'] = container[0].find_element(
        By.CSS_SELECTOR, ".css-vxcmzt"
        ).text
    driver.back()


    current_time = datetime.datetime.now()

    ticket['hour_scrap'] = current_time.hour
    ticket['day_scrap'] = current_time.day
    
    return(ticket)

In [9]:
def scrape_cards(dict: list= [])->list:
    """
    Function that generates list of dictionaries of n flight-card(s) for one search result page.

    Args:
    -----
        n_cards (int): n first flight-card(s)
                            (Default sets to total number of flight-cards in one page)
    
    Returns:
    --------
        list: list of dictionnaries
    """
    n_cards = len(driver.find_elements(By.CSS_SELECTOR, ".css-4o3ibe"))

    for i in range(0, n_cards-1):
        dict.append(scrape_card(i))

    return(dict)

In [10]:
my_dict = []
def get_pages(dict: list=my_dict):
    tot_pages = len(driver.find_elements(By.CSS_SELECTOR, '.Pagination-module__link___XUrx7, .Pagination-module__link___XUrx7:link, .Pagination-module__link___XUrx7:visited, .Pagination-module__separator___hRwOo'))
    for i in range(1, tot_pages+1):
        print(f"Scraping page {i}")
        scrape_cards(dict)
        time.sleep(2)
        next_button = driver.find_element(By.CSS_SELECTOR, f'button[aria-label=" {i}"]')
        next_button.click()
        time.sleep(3)
    
    scrape_cards(dict)
    
    return(dict) 

In [11]:
page()
manage_coockies()
go_to_flights()
input_from_city_airport()
input_to_city_airport()
round_trip_date_chooser()

In [13]:
airline_chooser()

In [14]:
get_pages()

Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6


[{'airline_company': 'Qatar Airways',
  'dep_city': 'BRU',
  'arr_city': 'CGK',
  'out_dep_date': 'Aug 1',
  'out_dep_time': '9:05 AM',
  'out_duration': '17h 35m',
  'out_stop_num': '1 stop',
  'out_arr_date': 'Aug 2',
  'out_arr_time': '7:40 AM',
  'in_dep_date': 'Aug 15',
  'in_dep_time': '9:05 AM',
  'in_duration': '34h 05m',
  'in_stop_num': '1 stop',
  'in_arr_date': 'Aug 16',
  'in_arr_time': '2:10 PM',
  'price_ticket': '1 705,46 €',
  'hour_scrap': 10,
  'day_scrap': 17},
 {'airline_company': 'Qatar Airways',
  'dep_city': 'BRU',
  'arr_city': 'CGK',
  'out_dep_date': 'Aug 1',
  'out_dep_time': '9:05 AM',
  'out_duration': '17h 35m',
  'out_stop_num': '1 stop',
  'out_arr_date': 'Aug 2',
  'out_arr_time': '7:40 AM',
  'in_dep_date': 'Aug 15',
  'in_dep_time': '7:00 PM',
  'in_duration': '18h 10m',
  'in_stop_num': '2 stops',
  'in_arr_date': 'Aug 16',
  'in_arr_time': '8:10 AM',
  'price_ticket': '2 334,50 €',
  'hour_scrap': 10,
  'day_scrap': 17},
 {'airline_company': 'Qatar

In [15]:
cur_time = datetime.datetime.now()
day = cur_time.day
hour = cur_time.strftime("%I")
hour_spe = cur_time.strftime("%p")

df = pd.DataFrame(my_dict)
df.to_csv(f'booking_{day}_may_{hour}_{hour_spe}.csv', index=False)
df.to_excel(f'booking_{day}_may_{hour}_{hour_spe}.xlsx', index=False)

In [16]:
import shutil

shutil.move(f'booking_{day}_may_{hour}_{hour_spe}.csv', 'Bxl_Jakarta')
shutil.move(f'booking_{day}_may_{hour}_{hour_spe}.xlsx', 'Bxl_Jakarta')

'Bxl_Jakarta\\booking_17_may_10_AM.xlsx'

In [17]:
go_to_flights()
input_from_city_airport()
input_to_city_airport(city="tokyo", airport='AIRPORTHND')
round_trip_date_chooser()

In [19]:
airline_chooser(airline_1= "Lufthansa", airline_2="Qatar Airways")

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[data-testid='search_filter_airline_Qatar Airways']"}
  (Session info: chrome=113.0.5672.94)
Stacktrace:
Backtrace:
	GetHandleVerifier [0x00BC8893+48451]
	(No symbol) [0x00B5B8A1]
	(No symbol) [0x00A65058]
	(No symbol) [0x00A90467]
	(No symbol) [0x00A9069B]
	(No symbol) [0x00ABDD92]
	(No symbol) [0x00AAA304]
	(No symbol) [0x00ABC482]
	(No symbol) [0x00AAA0B6]
	(No symbol) [0x00A87E08]
	(No symbol) [0x00A88F2D]
	GetHandleVerifier [0x00E28E3A+2540266]
	GetHandleVerifier [0x00E68959+2801161]
	GetHandleVerifier [0x00E6295C+2776588]
	GetHandleVerifier [0x00C52280+612144]
	(No symbol) [0x00B64F6C]
	(No symbol) [0x00B611D8]
	(No symbol) [0x00B612BB]
	(No symbol) [0x00B54857]
	BaseThreadInitThunk [0x75FC7D59+25]
	RtlInitializeExceptionChain [0x776BB74B+107]
	RtlClearBits [0x776BB6CF+191]


In [20]:
new_dict = []
get_pages(dict=new_dict)

Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6


[{'airline_company': 'Lufthansa',
  'dep_city': 'BRU',
  'arr_city': 'HND',
  'out_dep_date': 'Aug 1',
  'out_dep_time': '11:20 AM',
  'out_duration': '15h 30m',
  'out_stop_num': '1 stop',
  'out_arr_date': 'Aug 2',
  'out_arr_time': '9:50 AM',
  'in_dep_date': 'Aug 15',
  'in_dep_time': '9:40 AM',
  'in_duration': '17h 25m',
  'in_stop_num': '1 stop',
  'in_arr_date': 'Aug 15',
  'in_arr_time': '8:05 PM',
  'price_ticket': '2 216,97 €',
  'hour_scrap': 10,
  'day_scrap': 17},
 {'airline_company': 'Lufthansa',
  'dep_city': 'BRU',
  'arr_city': 'HND',
  'out_dep_date': 'Aug 1',
  'out_dep_time': '8:50 AM',
  'out_duration': '16h 10m',
  'out_stop_num': '1 stop',
  'out_arr_date': 'Aug 2',
  'out_arr_time': '8:00 AM',
  'in_dep_date': 'Aug 15',
  'in_dep_time': '9:40 AM',
  'in_duration': '17h 25m',
  'in_stop_num': '1 stop',
  'in_arr_date': 'Aug 15',
  'in_arr_time': '8:05 PM',
  'price_ticket': '2 202,74 €',
  'hour_scrap': 10,
  'day_scrap': 17},
 {'airline_company': 'Lufthansa',
 

In [21]:
df1 = pd.DataFrame(new_dict)
df1.to_csv(f'booking_{day}_may_{hour}_{hour_spe}.csv', index=False)
df1.to_excel(f'booking_{day}_may_{hour}_{hour_spe}.xlsx', index=False)

In [22]:
shutil.move(f'booking_{day}_may_{hour}_{hour_spe}.csv', 'Bxl_Tokyo')
shutil.move(f'booking_{day}_may_{hour}_{hour_spe}.xlsx', 'Bxl_Tokyo')

'Bxl_Tokyo\\booking_17_may_10_AM.xlsx'