Reference:
https://github.com/Vincent-Cui/flights_checker/blob/master/flight_checker_v2.0..py

Check Kayak for one-way, economy-class flight tickets.

# Import Libraries

In [None]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
import time
import datetime

import threading
import tqdm

# Self-Defined Functions

FLIGHT INFORMATION:

新加坡-广州往返（TR100/TR101）每周日执飞

新加坡-香港往返（TR980/TR981）每天执飞

新加坡-澳门往返（TR904/TR905）每周一、三、六执飞

新加坡-天津往返（TR138/TR139）每周三执飞

新加坡-武汉往返（TR120/TR121）每周四执飞

新加坡-郑州往返（TR116/TR117）每周二执飞

新加坡-上海往返（SQ833/SQ830）每周一执飞

新加坡-重庆往返（SQ815/SQ814）每周一执飞

新加坡-深圳往返（SQ847/SQ846）每周日执飞

新加坡-上海往返（MU 567/568）每周四执飞

新加坡-西安（MU2070）

one way from SIN to CAN, one adult, date, economy
https://www.kayak.sg/flights/SIN-CAN/2022-04-05?sort=bestflight_a

one way from SIN to CAN, two adults, date, economy
https://www.kayak.sg/flights/SIN-CAN/2022-04-05/2adults?sort=bestflight_a

one way from SIN to CAN, two adults, date, business
https://www.kayak.sg/flights/SIN-CAN/2022-04-05/business/2adults?sort=bestflight_a

one way from SIN to CAN, one adult, date +/- 3 flexible days, economy
https://www.kayak.sg/flights/SIN-CAN/2022-04-05-flexible?sort=bestflight_a

one way from SIN to CAN, one adult, date +/- 2 flexible days, economy
https://www.kayak.sg/flights/SIN-CAN/2022-04-05-flexible-2days?sort=bestflight_a

On macOS, chromedriver must have its quarantine flag removed before it can be used. Open a terminal and run:

cd ../../..
cd usr/local/bin  # chromedriver is stored here

xattr -d com.apple.quarantine chromedriver   # remove the restriction

In [None]:
def str_to_datetext(text, year='2022'):
    """Build a ``YYYY-MM-DD`` date string from a ``D-M`` string.

    The text is split on ``-``; the first piece is taken as the day and the
    second as the month. A piece that is exactly one character long gets a
    leading zero, any other piece is used unchanged.

    text: day and month separated by a dash, e.g. '5-4'
    year: year to prepend, as a string
    """
    parts = text.split('-')

    # Pad only single-character pieces; everything else passes through as-is.
    day, month = (
        piece if len(piece) != 1 else '0' + piece
        for piece in (parts[0], parts[1])
    )

    return '-'.join((year, month, day))


def convert_df_type(df_record):
    """Convert the string date/time columns to date and time objects.

    'Date' becomes ``datetime.date`` values; 'Dept Time' and 'Arrv Time'
    become ``datetime.time`` values. The three columns are overwritten on the
    frame that was passed in. The returned frame is sorted by 'Date' and then
    'Dept Time', with a fresh 0..n-1 index.
    """
    df_record['Date'] = pd.to_datetime(df_record['Date']).dt.date

    for clock_col in ('Dept Time', 'Arrv Time'):
        parsed = pd.to_datetime(df_record[clock_col])
        df_record[clock_col] = parsed.dt.time

    ordered = df_record.sort_values(by=['Date', 'Dept Time'])
    return ordered.reset_index(drop=True)


def create_checkdates(start, end):
    """Create the centre dates for consecutive +/- 3 day flexible searches.

    Each returned date is the middle of a 7-day window (centre - 3 days to
    centre + 3 days). The first window begins on ``start`` and windows follow
    back to back, so every day from ``start`` to ``end`` falls inside exactly
    one window. The last window may extend past ``end`` when the period is
    not a whole number of weeks.

    start: YYYY-MM-DD
    end: YYYY-MM-DD

    Returns a list of YYYY-MM-DD strings; empty when ``end`` is before
    ``start``.
    """
    first_day = datetime.datetime.strptime(start, '%Y-%m-%d').date()
    last_day = datetime.datetime.strptime(end, '%Y-%m-%d').date()
    half_window = datetime.timedelta(days=3)
    window = datetime.timedelta(days=7)

    check_dates = []
    centre = first_day + half_window
    # Record the centre *before* stepping forward: stepping first skipped the
    # opening week and could emit a window lying entirely after ``end``.
    while centre - half_window <= last_day:
        check_dates.append(str(centre))
        centre += window

    return check_dates

# Search Functions

## Single Date

In [None]:
# Search function: a single date
def Search_single(dept, arrv, date, save=False):
    """Scrape Kayak for one-way flights on a single date.

    Opens ``https://www.kayak.sg/flights/<dept>-<arrv>/<date>?sort=bestflight_a``
    in headless Chrome, waits five seconds for the page to render, then parses
    every result card found in the page source.

    dept: departure code used in the URL (e.g. 'SIN')
    arrv: arrival code used in the URL (e.g. 'CAN')
    date: YYYY-MM-DD
    save: when true, write the table to ``df_record_<arrv>.csv`` once all
        cards have been parsed

    Returns a DataFrame with one row per result card and the columns Date,
    Dept Time, Arrv Time, Dept, Arrv, Stops, Duration, Airline and Price.
    The scraped values are kept as strings. The frame is empty (columns only)
    when the page contained no result cards.
    """
    columns = [
        'Date', 'Dept Time', 'Arrv Time', 'Dept', 'Arrv', 'Stops', 'Duration',
        'Airline', 'Price'
    ]

    # Chrome driver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--window-size=1280x1696')
    options.add_argument('--hide-scrollbars')
    options.add_argument('--ignore-certificate-errors')
    options.add_argument(
        'user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    )
    driver = webdriver.Chrome(options=options)

    # Prepare url
    url = 'https://www.kayak.sg/flights/'
    url1 = url + dept + '-' + arrv + '/' + date + '?sort=bestflight_a'

    # Always shut the browser down, even when loading the page fails,
    # so failed searches do not leave Chrome processes behind.
    try:
        driver.get(url1)
        time.sleep(5)
        source_code = driver.page_source
    finally:
        driver.quit()

    # Result
    bs = BeautifulSoup(source_code, 'html.parser')
    content = bs.find_all('div', class_='inner-grid keel-grid')

    # Rows are collected in a list and turned into a frame once at the end;
    # DataFrame.append no longer exists in pandas 2.x.
    rows = []
    for tag in content:
        flight = tag.find('div', class_='col-info result-column')

        info = flight.find('div', class_='container')
        dept_time = info.find('span', class_='depart-time base-time').text
        arrv_time = info.find('span', class_='arrival-time base-time').text
        airports = info.find_all('div', class_='bottom-airport')
        airport_dept = airports[0].find(
            'span', class_='airport-name').text.replace('\n', ' ').strip()
        airport_arrv = airports[1].find(
            'span', class_='airport-name').text.replace('\n', ' ').strip()
        duration = info.find(
            'div', class_='section duration allow-multi-modal-icons').find(
                'div', class_='top').text.replace('\n', '').strip()
        airlines = flight.find('span', class_='codeshares-airline-names').text
        airprice = tag.find('div', class_='booking').find(
            'span', class_='price-text').text.replace('\nS$\xa0', '').strip()

        # Number of stops: prefer the "with-warning" variant of the label and
        # fall back to the plain one when it is absent.
        stops = info.find('div', class_='section stops')
        stop_span = stops.find('span', class_='stops-text with-warning')
        if stop_span is None:
            stop_span = stops.find('span', class_='stops-text')
        num_stop = stop_span.text.replace('\n', '').strip()

        rows.append({
            'Date': date,
            'Dept Time': dept_time,
            'Arrv Time': arrv_time,
            'Dept': airport_dept,
            'Arrv': airport_arrv,
            'Stops': num_stop,
            'Duration': duration,
            'Airline': airlines,
            'Price': airprice
        })

    df_record = pd.DataFrame(rows, columns=columns)

    # Save once, after every card has been parsed
    if save:
        df_record.to_csv(f'df_record_{arrv}.csv')
    return df_record

## Flexible +/- 3 Days

In [None]:
# Search function: a single date with +/- 3 days of flexibility
def Search_range3(dept, arrv, date, save=False):
    """Scrape Kayak for one-way flights on a date with +/- 3 days flexibility.

    Opens ``https://www.kayak.sg/flights/<dept>-<arrv>/<date>-flexible?sort=bestflight_a``
    in headless Chrome, waits five seconds for the page to render, then parses
    every result card found in the page source.

    dept: departure code used in the URL (e.g. 'SIN')
    arrv: arrival code used in the URL (e.g. 'CAN')
    date: YYYY-MM-DD, the centre of the flexible window
    save: when true, write the table to ``df_record_<arrv>.csv`` once all
        cards have been parsed

    Returns a DataFrame with one row per result card and the columns Date,
    Dept Time, Arrv Time, Dept, Arrv, Stops, Duration, Airline and Price.
    'Date' is the flight date shown on the card, converted to YYYY-MM-DD.
    The frame is empty (columns only) when the page contained no result cards.
    """
    columns = [
        'Date', 'Dept Time', 'Arrv Time', 'Dept', 'Arrv', 'Stops', 'Duration',
        'Airline', 'Price'
    ]

    # The result cards only carry day and month, so the year comes from the
    # requested date rather than a fixed value.
    # NOTE(review): a window that straddles New Year still gets the request's
    # year for every row - confirm whether that case matters.
    year = date.split('-')[0]

    # Chrome driver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--window-size=1280x1696')
    options.add_argument('--hide-scrollbars')
    options.add_argument('--ignore-certificate-errors')
    options.add_argument(
        'user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    )
    driver = webdriver.Chrome(options=options)

    # Prepare url
    url = 'https://www.kayak.sg/flights/'
    url1 = url + dept + '-' + arrv + '/' + date + '-flexible?sort=bestflight_a'

    # Always shut the browser down, even when loading the page fails,
    # so failed searches do not leave Chrome processes behind.
    try:
        driver.get(url1)
        time.sleep(5)
        source_code = driver.page_source
    finally:
        driver.quit()

    # Result
    bs = BeautifulSoup(source_code, 'html.parser')
    content = bs.find_all('div', class_='inner-grid keel-grid')

    # Rows are collected in a list and turned into a frame once at the end;
    # DataFrame.append no longer exists in pandas 2.x.
    rows = []
    for tag in content:
        flight = tag.find('div', class_='col-info result-column')

        info = flight.find('div', class_='container')
        dept_time = info.find('span', class_='depart-time base-time').text
        arrv_time = info.find('span', class_='arrival-time base-time').text
        airports = info.find_all('div', class_='bottom-airport')
        airport_dept = airports[0].find(
            'span', class_='airport-name').text.replace('\n', ' ').strip()
        airport_arrv = airports[1].find(
            'span', class_='airport-name').text.replace('\n', ' ').strip()
        duration = info.find(
            'div', class_='section duration allow-multi-modal-icons').find(
                'div', class_='top').text.replace('\n', '').strip()
        airlines = flight.find('span', class_='codeshares-airline-names').text

        # Price: the regular price label, or the "buzzPrice" block when the
        # regular label is missing.
        booking = tag.find('div', class_='booking')
        price_tag = booking.find('span', class_='price-text')
        if price_tag is None:
            price_tag = booking.find('div', class_='buzzPrice')
        airprice = price_tag.text.replace('\n', '').replace('S$\xa0',
                                                            '').strip()

        # Date: prefer the "flag" span of the carrier section; otherwise take
        # the first span of the stacked-carriers section when there is one,
        # else the first span of the carrier section.
        carrier = info.find('div', class_='section carrier with-date')
        flag = None
        if carrier is not None:
            flag = carrier.find('span', class_='flag')
        if flag is not None:
            airdate = flag.text
        else:
            stacked = info.find(
                'div', class_='section stacked-carriers with-date')
            holder = carrier if stacked is None else stacked
            airdate = holder.find('span').text

        # Number of stops: prefer the "with-warning" variant of the label and
        # fall back to the plain one when it is absent.
        stops = info.find('div', class_='section stops')
        stop_span = stops.find('span', class_='stops-text with-warning')
        if stop_span is None:
            stop_span = stops.find('span', class_='stops-text')
        num_stop = stop_span.text.replace('\n', '').strip()

        rows.append({
            'Date': str_to_datetext(airdate, year),
            'Dept Time': dept_time,
            'Arrv Time': arrv_time,
            'Dept': airport_dept,
            'Arrv': airport_arrv,
            'Stops': num_stop,
            'Duration': duration,
            'Airline': airlines,
            'Price': airprice
        })

    df_record = pd.DataFrame(rows, columns=columns)

    # Save once, after every card has been parsed
    if save:
        df_record.to_csv(f'df_record_{arrv}.csv')

    return df_record

## Range Start - End

In [None]:
def Search_period(dept, arrv, start_date, end_date, save=False, max_retries=5):
    """Search a whole period by chaining +/- 3 day flexible searches.

    The period is turned into check dates with ``create_checkdates`` and
    ``Search_range3`` is run for each of them. A search that returns fewer
    than two rows is treated as a failed page load and repeated, up to
    ``max_retries`` attempts per check date; after that whatever the last
    attempt returned is kept, so a route with no flights in a given week
    cannot stall the run forever.

    dept: departure code (e.g. 'SIN')
    arrv: arrival code (e.g. 'CAN')
    start_date: YYYY-MM-DD
    end_date: YYYY-MM-DD
    save: when true, write the combined table to ``df_record_<arrv>.csv``
    max_retries: maximum number of searches per check date

    Returns the rows of all check dates in one DataFrame (empty when there
    are no check dates).
    """
    check_dates = create_checkdates(start_date, end_date)

    frames = []
    for date in tqdm.tqdm(check_dates):
        print(f'Check Date: {date}')
        tmp = pd.DataFrame()
        for _attempt in range(max_retries):
            tmp = Search_range3(dept, arrv, date)
            # Pause after every request, successful or not, to pace the
            # traffic sent to the site.
            time.sleep(8)
            if tmp.shape[0] >= 2:
                break
        else:
            print(f'Giving up on {date} after {max_retries} attempts')

        frames.append(tmp)

    # pd.concat rejects an empty list, so fall back to an empty frame.
    result = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Save
    if save:
        result.to_csv(f'df_record_{arrv}.csv')

    return result

# Run

In [None]:
def run_all(dept, arrv_list, start_date, end_date, save=False, max_workers=4):
    """Run ``Search_period`` for several destinations on worker threads.

    Each destination is searched on its own worker thread and the resulting
    tables are combined in memory, so the return value always reflects this
    run regardless of ``save``.

    dept: departure code (e.g. 'SIN')
    arrv_list: arrival codes to search
    start_date: YYYY-MM-DD
    end_date: YYYY-MM-DD
    save: passed through to ``Search_period``; when true each destination is
        also written to ``df_record_<arrv>.csv``
    max_workers: upper bound on searches running at the same time (each one
        drives its own headless browser); ``None`` runs every destination at
        once

    Returns one DataFrame with the rows of all destinations, converted and
    sorted by ``convert_df_type``. An empty frame is returned when there are
    no destinations or no rows. An exception raised by any search is
    re-raised here.
    """
    # Local import: the worker pool is only needed inside this function.
    from concurrent.futures import ThreadPoolExecutor

    destinations = list(arrv_list)
    if not destinations:
        return pd.DataFrame()

    def search_destination(arrv):
        # Runs on a worker thread. The callable is handed to the pool
        # uncalled; invoking Search_period while building the thread would
        # run every search up front on the calling thread.
        print(arrv)
        return Search_period(dept=dept,
                             arrv=arrv,
                             start_date=start_date,
                             end_date=end_date,
                             save=save)

    if max_workers is None:
        workers = len(destinations)
    else:
        workers = max(1, min(max_workers, len(destinations)))

    print('-------------Run------------')
    # Leaving the ``with`` block waits for every worker to finish; ``map``
    # yields the results in the order of ``destinations``.
    with ThreadPoolExecutor(max_workers=workers,
                            thread_name_prefix='FlightSearch') as pool:
        frames = list(pool.map(search_destination, destinations))

    # Combine as whole
    final = pd.concat(frames, ignore_index=True)
    if final.empty:
        return final

    return convert_df_type(final)

In [None]:
# Departure code and the destination codes to search.
dept = 'SIN'
arrv_list = [
    'CAN', 'CTU', 'CKG', 'SHA',
    'BJS', 'XMN', 'CGO', 'KHN',
    'TSN', 'WUH', 'SZX', 'XIY',
]

In [None]:
# First batch: 2022-04-10 to 2022-05-01, with CSV output enabled.
start_date = '2022-04-10'
end_date = '2022-05-01'
final_04 = run_all(dept, arrv_list, start_date, end_date, save=True)

In [None]:
# Wait 30 seconds, then run the batch for 2022-05-01 to 2022-06-01.
time.sleep(30)
start_date = '2022-05-01'
end_date = '2022-06-01'
final_05 = run_all(dept, arrv_list, start_date, end_date, save=False)

In [None]:
# Wait 30 seconds, then run the batch for 2022-06-01 to 2022-07-01.
time.sleep(30)
start_date = '2022-06-01'
end_date = '2022-07-01'
final_06 = run_all(dept, arrv_list, start_date, end_date, save=False)

In [None]:
# Wait 30 seconds, then run the batch for 2022-07-01 to 2022-08-01.
time.sleep(30)
start_date = '2022-07-01'
end_date = '2022-08-01'
final_07 = run_all(dept, arrv_list, start_date, end_date, save=False)