<a href="https://colab.research.google.com/github/akbarriki/scraping_tiketcom/blob/main/Scraping_Tiketcom_BestPrice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Scraping Best Price for Flights from Jakarta to Selected Airports in Indonesia**

## This dataset was collected from [Tiket.com](https://tiket.com), one of the online travel agent platforms in Indonesia

Author: [Riki Akbar](https://akbarriki.github.io)
<br />Last update: 31 December 2022<br />Dataset is now available [here](https://www.kaggle.com/datasets/datasciencerikiakbar/tiketcom-best-price-for-flights-from-jakarta)

Feel free to adjust the code, but **please keep the attribution**


In [1]:
import pandas as pd
import numpy as np
import requests, json, time, math, os
from tqdm import tqdm
from bs4 import BeautifulSoup
from datetime import datetime as dt
from datetime import date
from calendar import monthrange
from google.colab import drive

# mount google drive so the project folder (and the csv written into it) is reachable
drive.mount('/content/gdrive', force_remount=True)
project_folder = 'gdrive/MyDrive/scraping_projects/tiketcom'
output_file = 'tiketcom_bestprice.csv'
# project_folder is relative to /content (where the drive is mounted above).
# Anchor it explicitly: after the first chdir the working directory is no longer
# /content, so a bare relative chdir would fail when this cell is run again.
os.chdir(os.path.join('/content', project_folder))
os.getcwd()

Mounted at /content/gdrive


'/content/gdrive/MyDrive/scraping_projects/tiketcom'

In [2]:
# define execution timestamp (the same value is stamped on every row of this run)
exec_timestamp = dt.now()

# define all required variables
base_url = 'https://www.tiket.com/'

# target list: one dict per (destination, departure date) entry returned by the endpoint
_data_ = []

num_adult, num_child, num_infant = 1, 0, 0
_ORIGIN_, _ORIGINTYPE_ = 'JKTC', 'CITY'
_CABINCLASS_ = 'ECONOMY'

# the loop below covers months 1..6 of the year after the current one
target_year = date.today().year + 1

# seconds to wait for the server on a single request before treating it as failed,
# so that one stalled connection cannot hang the whole run
request_timeout = 30

_DESTINATIONLIST_ = ['BTJ','BDO','MLG','SRG','SUB','JOG','YIA','SOC','TKG','BTH',
                     'BKS','DJB','KNO','PDG','PLM','PGK','PKU','TNJ','BPN','BDJ',
                     'PNK','SRI','TRK','UPG','MDC','DPS','LOP','BIK','DJJ','MKQ']

for i, _DESTINATION_ in enumerate(_DESTINATIONLIST_):
  print(_DESTINATION_)
  for m in tqdm(range(6)):

    # define current month, number of days in current month, and default selected date
    # (the 15th of the month, except april where it is the 19th)
    curmonth = m+1
    curmonth_numdays = str(monthrange(target_year, curmonth)[-1]).rjust(2,'0')
    # zero-pad the month so every date is a well-formed YYYY-MM-DD string
    year_month = f'{target_year}-{curmonth:02d}'
    selecteddate = f'{year_month}-15' if curmonth != 4 else f'{year_month}-19'
    startdate = f'{year_month}-01'
    enddate = f'{year_month}-{curmonth_numdays}'

    # define the type of destination (i.e., CITY or AIRPORT)
    _DESTINATIONTYPE_ = 'AIRPORT'

    # target url
    url = f'{base_url}ms-gateway/tix-price-summary/priceSummary/priceSummary?adult={num_adult}&child={num_child}&infant={num_infant}&origin={_ORIGIN_}&destination={_DESTINATION_}&startDate={startdate}&endDate={enddate}&currency=IDR&originType={_ORIGINTYPE_}&destinationType={_DESTINATIONTYPE_}&cabinClass={_CABINCLASS_}&flightAggregateType=OW'

    # referer url for headers setup
    referer = f'{base_url}pesawat/search?d={_ORIGIN_}&a={_DESTINATION_}&dType={_ORIGINTYPE_}&aType={_DESTINATIONTYPE_}&date={selecteddate}&adult={num_adult}&child={num_child}&infant={num_infant}&class={_CABINCLASS_.lower()}&flexiFare=false'

    # header config
    header = {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'Referer': referer,
        'X-Country-Code': 'IDN',
        'X-Request-id': '064f288d-65a8-4afb-87a0-a5a0647be197',
        'X-Store-Id': 'TIKETCOM'
    }

    # scraping process
    try:
      page = requests.get(url, headers=header, timeout=request_timeout)
      # surface HTTP errors (e.g. 429) explicitly instead of failing later on a missing key
      page.raise_for_status()
      response = json.loads(page.text)
      rows = [{'extract_timestamp':exec_timestamp,
               'origin': _ORIGIN_,
               'destination': _DESTINATION_,
               'depart_date':r['date'],
               'best_price':r['price']} for r in response['data']]
      _data_.extend(rows)
    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError) as err:
      # Only expected failures are handled: network/HTTP errors, invalid JSON, and an
      # unexpected payload shape. KeyboardInterrupt is deliberately not caught so the
      # run can still be stopped by hand. The exception itself is reported because
      # `page` may not exist (or may be stale) when the request itself failed.
      print(f'Fail to extract data (destination: {_DESTINATION_}, {target_year}-{curmonth})!')
      print(f'error: {err!r}')

  # delay execution to avoid response 429 (too many requests)
  if i>0 and i % 9 == 0:
    print()
    print('--Delay execution for 5 mins--')
    time.sleep(300)
    print()



BTJ


100%|██████████| 6/6 [00:01<00:00,  3.32it/s]


BDO


100%|██████████| 6/6 [00:01<00:00,  3.38it/s]


MLG


100%|██████████| 6/6 [00:01<00:00,  3.26it/s]


SRG


100%|██████████| 6/6 [00:01<00:00,  3.23it/s]


SUB


100%|██████████| 6/6 [00:01<00:00,  3.27it/s]


JOG


100%|██████████| 6/6 [00:01<00:00,  3.31it/s]


YIA


100%|██████████| 6/6 [00:01<00:00,  3.35it/s]


SOC


100%|██████████| 6/6 [00:01<00:00,  3.17it/s]


TKG


100%|██████████| 6/6 [00:01<00:00,  3.33it/s]


BTH


100%|██████████| 6/6 [00:01<00:00,  3.25it/s]



--Delay execution for 5 mins--

BKS


100%|██████████| 6/6 [00:01<00:00,  3.31it/s]


DJB


100%|██████████| 6/6 [00:01<00:00,  3.29it/s]


KNO


100%|██████████| 6/6 [00:01<00:00,  3.34it/s]


PDG


100%|██████████| 6/6 [00:01<00:00,  3.28it/s]


PLM


100%|██████████| 6/6 [00:01<00:00,  3.33it/s]


PGK


100%|██████████| 6/6 [00:01<00:00,  3.35it/s]


PKU


100%|██████████| 6/6 [00:01<00:00,  3.38it/s]


TNJ


100%|██████████| 6/6 [00:01<00:00,  3.31it/s]


BPN


100%|██████████| 6/6 [00:01<00:00,  3.39it/s]



--Delay execution for 5 mins--

BDJ


100%|██████████| 6/6 [00:01<00:00,  3.26it/s]


PNK


100%|██████████| 6/6 [00:01<00:00,  3.33it/s]


SRI


100%|██████████| 6/6 [00:01<00:00,  3.30it/s]


TRK


100%|██████████| 6/6 [00:01<00:00,  3.33it/s]


UPG


100%|██████████| 6/6 [00:01<00:00,  3.46it/s]


MDC


100%|██████████| 6/6 [00:01<00:00,  3.31it/s]


DPS


100%|██████████| 6/6 [00:01<00:00,  3.35it/s]


LOP


100%|██████████| 6/6 [00:01<00:00,  3.29it/s]


BIK


100%|██████████| 6/6 [00:01<00:00,  3.32it/s]



--Delay execution for 5 mins--

DJJ


100%|██████████| 6/6 [00:01<00:00,  3.35it/s]


MKQ


100%|██████████| 6/6 [00:01<00:00,  3.30it/s]


In [3]:
# persist the scraped rows: the first run creates the csv (with header),
# later runs append to it (without repeating the header)
if not _data_:
  print('No data found')
else:
  df = pd.DataFrame(_data_)
  csv_already_exists = os.path.isfile(output_file)
  if csv_already_exists:
    df.to_csv(output_file, index=False, sep='|', mode='a', header=False)
    print(f'file {output_file} has been appended with new data')
  else:
    df.to_csv(output_file, index=False, sep='|')
    print(f'data has been exported to {output_file}')

file tiketcom_bestprice.csv has been appended with new data


In [4]:
# read the accumulated csv back and report how many rows it holds
df_new = pd.read_csv(output_file, sep='|')
print(f'number of records: {len(df_new.index)}')

number of records: 14779


In [5]:
# preview the first five rows of the reloaded dataset
df_new.head(n=5)

Unnamed: 0,extract_timestamp,origin,destination,depart_date,best_price
0,2022-12-24 11:25:37.656571,JKTC,BTJ,2023-04-25,2310582.0
1,2022-12-24 11:25:37.656571,JKTC,BTJ,2023-04-04,2310582.0
2,2022-12-24 11:25:37.656571,JKTC,BTJ,2023-04-27,2310582.0
3,2022-12-24 11:25:37.656571,JKTC,BTJ,2023-04-29,2310582.0
4,2022-12-24 11:25:37.656571,JKTC,BTJ,2023-04-01,2316313.0


In [6]:
# observe best price from jakarta to KNO, ordered by departure date (earliest first)
is_kno = df_new['destination'] == 'KNO'
df_new.loc[is_kno].sort_values(by='depart_date')

Unnamed: 0,extract_timestamp,origin,destination,depart_date,best_price
6520,2022-12-29 03:13:11.798742,JKTC,KNO,2023-01-01,936450.0
11729,2022-12-31 01:49:22.466131,JKTC,KNO,2023-01-01,1021490.0
6501,2022-12-29 03:13:11.798742,JKTC,KNO,2023-01-02,936450.0
11704,2022-12-31 01:49:22.466131,JKTC,KNO,2023-01-02,936450.0
6502,2022-12-29 03:13:11.798742,JKTC,KNO,2023-01-03,936450.0
...,...,...,...,...,...
6665,2022-12-29 03:13:11.798742,JKTC,KNO,2023-06-29,1377550.0
11868,2022-12-31 01:49:22.466131,JKTC,KNO,2023-06-29,1377550.0
11879,2022-12-31 01:49:22.466131,JKTC,KNO,2023-06-30,1377550.0
6676,2022-12-29 03:13:11.798742,JKTC,KNO,2023-06-30,1377550.0
