<a href="https://colab.research.google.com/github/akbarriki/scraping_tiketcom/blob/main/Scraping_Tiketcom_BestPrice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Scraping Best Price for Flights from Jakarta to Selected Airports in Indonesia**

## This dataset is collected from [Tiket.com](https://tiket.com), an Online Travel Agent platform in Indonesia

Author: [Riki Akbar](https://akbarriki.github.io)
<br />Last update: 2 January 2023<br />Dataset is now available [here](https://www.kaggle.com/datasets/datasciencerikiakbar/tiketcom-best-price-for-flights-from-jakarta)

Feel free to adjust the code, but **please keep the attribution intact**.


### Imports and Mounting Drive

In [None]:
import pandas as pd
import requests, json, time, math, os
from tqdm import tqdm
from datetime import datetime as dt
from datetime import date
from calendar import monthrange
from dateutil.relativedelta import relativedelta
from google.colab import drive

# Mount Google Drive and switch the working directory to the project folder,
# so that `output_file` (a bare file name) is read from / written to Drive.
drive.mount('/content/gdrive', force_remount=True)

# Absolute path: with a relative path this cell only works while the current
# directory is /content, so re-running it after the chdir below would fail.
project_folder = '/content/gdrive/MyDrive/scraping_projects/tiketcom'
output_file = 'tiketcom_bestprice.csv'

# Create the folder on first use instead of failing in os.chdir.
os.makedirs(project_folder, exist_ok=True)
os.chdir(project_folder)
os.getcwd()

Mounted at /content/gdrive


'/content/gdrive/MyDrive/scraping_projects/tiketcom'

### Scraping

In [None]:
# Scrape the "best price" calendar for one-way economy flights from Jakarta to
# every airport in _DESTINATIONLIST_, one request per destination per month,
# and collect the rows flagged as best price into `_data_`.

# define execution timestamp (also the reference date for the month windows,
# so every request of a single run uses the same calendar baseline)
exec_timestamp = dt.now()

# define all required variables
base_url = 'https://www.tiket.com/'

# target list
_data_ = []

num_adult, num_child, num_infant = 1, 0, 0
_ORIGIN_, _ORIGINTYPE_ = 'JKTC', 'CITY'
_CABINCLASS_ = 'ECONOMY'

# kept for backward compatibility; the loop below does not use it
target_year = date.today().year + 1

# tunables
months_ahead = 6        # number of months to query, starting with the current one
pause_every = 9         # pause when the destination index is a positive multiple of this
pause_seconds = 300     # length of the pause (5 minutes)
request_timeout = 30    # seconds before a request is abandoned

_DESTINATIONLIST_ = ['BTJ','BDO','MLG','SRG','SUB','JOG','YIA','SOC','TKG','BTH',
                     'BKS','DJB','KNO','PDG','PLM','PGK','PKU','TNJ','BPN','BDJ',
                     'PNK','SRI','TRK','UPG','MDC','DPS','LOP','BIK','DJJ','MKQ']

# define the type of destination (i.e., CITY or AIRPORT)
_DESTINATIONTYPE_ = 'AIRPORT'

for i, _DESTINATION_ in enumerate(_DESTINATIONLIST_):
  print(_DESTINATION_)
  for m in tqdm(range(months_ahead)):

    # define current month, first/last day of that month, and the date used in
    # the referer url (the 15th, except April where the 19th is used)
    cdate = exec_timestamp + relativedelta(months=m)
    cyear, cmonth = cdate.year, cdate.month
    startdate = date(cyear, cmonth, 1).isoformat()
    enddate = date(cyear, cmonth, monthrange(cyear, cmonth)[-1]).isoformat()
    selecteddate = date(cyear, cmonth, 19 if cmonth == 4 else 15).isoformat()

    # target url
    url = f'{base_url}ms-gateway/tix-price-summary/priceSummary/priceSummary?adult={num_adult}&child={num_child}&infant={num_infant}&origin={_ORIGIN_}&destination={_DESTINATION_}&startDate={startdate}&endDate={enddate}&currency=IDR&originType={_ORIGINTYPE_}&destinationType={_DESTINATIONTYPE_}&cabinClass={_CABINCLASS_}&flightAggregateType=OW'

    # referer url for headers setup
    referer = f'{base_url}pesawat/search?d={_ORIGIN_}&a={_DESTINATION_}&dType={_ORIGINTYPE_}&aType={_DESTINATIONTYPE_}&date={selecteddate}&adult={num_adult}&child={num_child}&infant={num_infant}&class={_CABINCLASS_.lower()}&flexiFare=false'

    # header config
    header = {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'Referer': referer,
        'X-Country-Code': 'IDN',
        'X-Request-id': '064f288d-65a8-4afb-87a0-a5a0647be197',
        'X-Store-Id': 'TIKETCOM'
    }

    # scraping process
    # reset per request so the error report never shows a stale response
    page = None
    try:
      page = requests.get(url, headers=header, timeout=request_timeout)
      page.raise_for_status()
      response = page.json()
      # NOTE(review): the original compared against the string 'true'; a JSON
      # boolean is accepted as well -- confirm against a live response.
      _data_ += [{'extract_timestamp':exec_timestamp,
                  'origin': _ORIGIN_,
                  'destination': _DESTINATION_,
                  'depart_date':r['date'],
                  'best_price':r['price']}
                 for r in response['data']
                 if str(r['isBestPrice']).lower() == 'true']
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
      # network/HTTP failure, non-JSON body, or unexpected payload shape:
      # report it and continue with the next month
      print(f'Fail to extract data (destination: {_DESTINATION_}, {cyear}-{cmonth})!')
      print(f'error: {err!r} (response: {page})')

  # delay execution to avoid response 429 (too many requests)
  if i > 0 and i % pause_every == 0:
    print()
    print('--Delay execution for 5 mins--')
    time.sleep(pause_seconds)
    print()



BTJ


100%|██████████| 6/6 [00:01<00:00,  3.44it/s]


BDO


100%|██████████| 6/6 [00:01<00:00,  3.54it/s]


MLG


100%|██████████| 6/6 [00:01<00:00,  3.55it/s]


SRG


100%|██████████| 6/6 [00:01<00:00,  3.41it/s]


SUB


100%|██████████| 6/6 [00:01<00:00,  3.40it/s]


JOG


100%|██████████| 6/6 [00:01<00:00,  3.43it/s]


YIA


100%|██████████| 6/6 [00:01<00:00,  3.52it/s]


SOC


100%|██████████| 6/6 [00:01<00:00,  3.61it/s]


TKG


100%|██████████| 6/6 [00:01<00:00,  3.36it/s]


BTH


100%|██████████| 6/6 [00:01<00:00,  3.58it/s]



--Delay execution for 5 mins--

BKS


100%|██████████| 6/6 [00:01<00:00,  3.52it/s]


DJB


100%|██████████| 6/6 [00:01<00:00,  3.53it/s]


KNO


100%|██████████| 6/6 [00:01<00:00,  3.37it/s]


PDG


100%|██████████| 6/6 [00:01<00:00,  3.40it/s]


PLM


100%|██████████| 6/6 [00:01<00:00,  3.45it/s]


PGK


100%|██████████| 6/6 [00:01<00:00,  3.35it/s]


PKU


100%|██████████| 6/6 [00:01<00:00,  3.37it/s]


TNJ


100%|██████████| 6/6 [00:01<00:00,  3.53it/s]


BPN


100%|██████████| 6/6 [00:01<00:00,  3.43it/s]



--Delay execution for 5 mins--

BDJ


100%|██████████| 6/6 [00:01<00:00,  3.42it/s]


PNK


100%|██████████| 6/6 [00:01<00:00,  3.41it/s]


SRI


100%|██████████| 6/6 [00:01<00:00,  3.56it/s]


TRK


 50%|█████     | 3/6 [00:00<00:00,  3.50it/s]

### Export the Result

In [None]:
# Persist the scraped rows: create the pipe-delimited CSV (with header) on the
# first run, otherwise append the new rows without repeating the header.
if not _data_:
  print('No data found')
else:
  df = pd.DataFrame(_data_)
  appending = os.path.isfile(output_file)
  df.to_csv(output_file,
            index=False,
            sep='|',
            mode='a' if appending else 'w',
            header=not appending)
  if appending:
    print(f'file {output_file} has been appended with new data')
  else:
    print(f'data has been exported to {output_file}')

### Sanity Checks

##### Check the total number of records

In [None]:
# Read the exported pipe-delimited file back and report its row count.
df_new = pd.read_csv(output_file, sep='|')
print(f'number of records: {df_new.shape[0]}')

##### Check the top 5 rows

In [None]:
# Preview the first five rows of the exported data.
df_new.head(n=5)

##### Observe the best price data for flights from Jakarta to KNO

In [None]:
# Earliest five departure dates among the rows whose destination is KNO.
(
    df_new
    .loc[df_new['destination'].eq('KNO')]
    .sort_values(by='depart_date')
    .head()
)

##### Make sure there are no non-numeric entries in the `best_price` column

In [None]:
# Convert every best_price entry with float(); a value that float() cannot
# parse raises here, which is the point of this check.
df_new['best_price'].map(float)

##### Check the maximum departure date

In [None]:
# Largest value in the depart_date column of the exported data.
df_new['depart_date'].max()