<a href="https://colab.research.google.com/github/akbarriki/scraping_traveloka/blob/main/Scraping_Hotel_Traveloka.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Initialisation

In [1]:
import pandas as pd
import numpy as np
import requests, json, math, datetime
from tqdm import tqdm
from datetime import datetime as dt
from datetime import date
from urllib.parse import quote

# geocode for every city to scrape; the dict order is the scraping order
city_geocodes = dict([
    ('Jakarta', 102813),
    ('Bandung', 103859),
    ('Medan', 106161),
    ('Denpasar', 102747),
    ('Banda Aceh', 101116),
    ('Padang', 101687),
    ('Palembang', 101980),
    ('Bogor', 103909),
    ('Semarang', 106587),
    ('Yogyakarta', 107442),
    ('Solo', 106469),
    ('Surabaya', 103570),
])


# endpoint that receives the hotel search requests
api_url = 'https://www.traveloka.com/api/v2/hotel/searchList'

# notebook-level accumulator: every collected record is appended to this list
_data_ = list()

### Functions

In [2]:
# left-pad a value with '0' so its text is at least two characters wide
def padNumber(num):
  """Return `str(num)`, right-aligned in a field of width 2 filled with '0'.

  Values whose text is already two or more characters long are returned
  unchanged (as a string).
  """
  text = str(num)
  # '0>2' = fill with '0', right-align, minimum width 2
  return format(text, '0>2')


# errors that mean "this field is missing or malformed" for a single entry;
# anything else (e.g. KeyboardInterrupt) is allowed to propagate
_FIELD_ERRORS = (KeyError, IndexError, TypeError, ValueError, ZeroDivisionError)


# flatten the rate displays of one `hotelInventorySummary` value into record fields
def _extractRates(summary, numnights):
  """Return the flattened rate fields of a `hotelInventorySummary` value.

  For each of `cheapestRateDisplay` and `originalRateDisplay` the result holds
  the `amount` of `baseFare`, `fees`, `taxes` and `totalFare` as found
  (`<prefix>_allNights_<part>`) and the same amounts divided by `numnights`
  (`<prefix>_perNight_<part>`).

  Any missing key or non-numeric amount raises, so the caller gets either the
  complete set of fields or none of them.
  """
  parts = ('baseFare', 'fees', 'taxes', 'totalFare')
  rates = {}
  for prefix in ('cheapestRate', 'originalRate'):
    display = summary[prefix + 'Display']
    amounts = {part: display[part]['amount'] for part in parts}
    for part in parts:
      rates[f'{prefix}_allNights_{part}'] = amounts[part]
    # baseFare is truncated with int() while totalFare is rounded to the
    # nearest integer; fees and taxes keep two decimals
    rates[f'{prefix}_perNight_baseFare'] = int(float(amounts['baseFare']) / numnights)
    rates[f'{prefix}_perNight_fees'] = round(float(amounts['fees']) / numnights, 2)
    rates[f'{prefix}_perNight_taxes'] = round(float(amounts['taxes']) / numnights, 2)
    rates[f'{prefix}_perNight_totalFare'] = round(float(amounts['totalFare']) / numnights)
  return rates


# collect one flat record per entry of a search response
def collectData(response):
  """Turn a search response into a list of flat record dicts.

  Iterates `response['data']['entries']` (with a tqdm progress bar) and builds
  one dict per entry. Besides `response`, the function reads these
  notebook-level names, which must be defined before it is called: `columns`,
  `city`, `startdate`, `enddate` and `checkin_numnights`.

  Each record starts with `timestamp`, `city`, `checkin`, `checkout` and
  `num_staying_nights`, followed by one field per name in `columns`:
    - `hotelFeatures` is stored as the features' `text` values joined by ', ';
    - `hotelInventorySummary` is replaced by the flattened rate fields
      produced by `_extractRates`;
    - every other column is copied as is.
  A column that is missing or malformed is stored as '-' under its own name.

  Returns the list of records, in entry order.
  """
  hotels = []

  for entry in tqdm(response['data']['entries']):
    # use the first `inventoryList` item when there is one, otherwise the
    # entry's own `data`
    try:
      sample = entry['data']['inventoryList'][0]
    except _FIELD_ERRORS:
      sample = entry['data']

    record = {'timestamp': dt.now(), 'city': city,'checkin': startdate,'checkout': enddate,'num_staying_nights':checkin_numnights}

    for c in columns:
      try:
        if c == 'hotelFeatures':
          record[c] = ', '.join([feature['text'] for feature in sample[c]])
        elif c == 'hotelInventorySummary':
          record.update(_extractRates(sample[c], checkin_numnights))
        else:
          record[c] = sample[c]
      except _FIELD_ERRORS:
        record[c] = '-'
    hotels.append(record)
  return hotels

### Scraping

In [4]:
# new year reservation
curyear = date.today().year # current year
startdd, startmm = 31, 12 # check in date and month
enddd, endmm = 2, 1 # check out date and month

# new year staying range, formatted as dd-mm-yyyy (check out falls in the next year)
startdate = f'{padNumber(startdd)}-{padNumber(startmm)}-{curyear}'
enddate = f'{padNumber(enddd)}-{padNumber(endmm)}-{curyear+1}'

# number of staying night(s)
checkin_numnights = (dt.strptime(enddate, '%d-%m-%Y')-dt.strptime(startdate, '%d-%m-%Y')).days


# data columns to be extracted
columns = ['id','name', 'displayName', 'region', 'starRating', 'userRating','numReviews','userRatingInfo',
           'latitude','longitude','lowRate','highRate',
           'hotelFeatures','hotelSeoUrl','hotelInventorySummary']

# seconds to wait for the API before giving up; without a timeout a stalled
# connection would block the cell forever
request_timeout = 30

# NOTE(security): this is a browser session cookie pasted into the notebook.
# Hard-coded session values should not be committed; consider loading the
# cookie from an environment variable or a local, untracked file instead.
session_cookie = '_gcl_au=1.1.141782728.1670394296; _gac_UA-29776811-12=1.1670394296.Cj0KCQiA7bucBhCeARIsAIOwr-9NHNVdQqtQnRrovoGHLpACWxlx50k6KyMR6yMRUacNvmsKp_P3V7EaAsihEALw_wcB; tv-repeat-visit=true; _gid=GA1.2.137383459.1672284459; g_state={"i_l":1,"i_p":1672291669115}; _ga=GA1.1.1144801166.1670394296; amp_1a5adb=lmXek5GnRbmDVMBIye772l...1gldunjc7.1gldup1ge.c.0.c; tvl=qgdHX7GvehrD9XH5a3S4PdE8AYpuF3hYPaT5bxhY7ZYlTfL+WyvcrSI/VxnhD+GdIauD2fuQAp48xn5SOy61CcGKsORldom9dTN23+66MAYIHEn0dmPxsmhM3nEpAG8sgD1ega4KxIBCDYlQGDuVKUzPw3pNExw5Cd1OxjjrNg3vlyHfFnPptZUxAgMVwRNSCMYWUJplNNMY2P4/83O9X+8GNrPf8Ng75ZieUaJama8=; tvs=qgdHX7GvehrD9XH5a3S4PWL3Nd74xArIuT+JzcRMbKddQHovERAJ9HWRLrAaZ0jPhWj5HSxm0ZKiRbldET1ham2PeYg1sQr2h/wIBjIyPQ1JQfOnq9PrXiJXCb7pG+GuL55zGx9BHnW6AktSohrCEcVZJJEBlMy+/xGmAFjHYdanG44/La0X6wsaDJDc5dQI3jW7f6f85zK7XA1xLrLbn3wpMY91AYFzJ6h8za/vSrng40uUoDT+qJIv0oQGNB1A; _fbp=fb.1.1672284509886.851897136; cto_bundle=LLTD719vOXhjJTJCWjlPeGhpNUJ1VEVFdHpLdUVraHZLa1Fkem44UDZ4R3FlNnRIcGhuaG8zSWhlVFd4TyUyRm5JOVUlMkZTSGJVZ2UyaDVsVWVzWm9ETlBRSTRweTlGRDg2eE81WnVHNXhzJTJGNkZsaVRlTzRHUGVzQ2llQTFXbVN0a3B2OXBSdWpXYWR5cXU5SGpjZTQ5SFk5dzRqR2NwUSUzRCUzRA; _ga_RSRSMMBH0X=GS1.1.1672284458.2.1.1672284554.60.0.0'

print(f'Scraping All Available Hotel Rooms, Checkin {startdate}, Checkout:{enddate}, Staying nights: {checkin_numnights}:')
print()

# `city` is deliberately a notebook-level loop variable: collectData() reads it
for city, geocode in city_geocodes.items():
  print()
  print(city.upper())

  # referer url
  referer_url = f'https://www.traveloka.com/id-id/hotel/search?spec={startdate}.{enddate}.1.1.HOTEL_GEO.{geocode}.{quote(city)}.2'

  # headers
  headers = {
      'content-type': 'application/json',
      'cookie': session_cookie,
      'origin': 'https://www.traveloka.com',
      'referer': referer_url,
      'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
      'x-domain': 'accomSearch',
      'x-route-prefix': 'id-id'
  }

  # payload, built as a dict so that the JSON encoder escapes every value
  # (concatenating strings into JSON text breaks on quotes or backslashes)
  payload = {
      'fields': [],
      'data': {
          'checkInDate': {'year': startdate.split('-')[-1], 'month': str(startmm), 'day': str(startdd)},
          'checkOutDate': {'year': enddate.split('-')[-1], 'month': str(endmm), 'day': str(enddd)},
          'numOfNights': checkin_numnights,
          'currency': 'IDR',
          'numAdults': 1,
          'numChildren': 0,
          'childAges': [],
          'numInfants': 0,
          'numRooms': 1,
          'ccGuaranteeOptions': {
              'ccInfoPreferences': ['CC_TOKEN', 'CC_FULL_INFO'],
              'ccGuaranteeRequirementOptions': ['CC_GUARANTEE'],
          },
          'rateTypes': ['PAY_NOW', 'PAY_AT_PROPERTY'],
          'isJustLogin': False,
          'backdate': False,
          'geoId': str(geocode),
          'monitoringSpec': {
              'lastKeyword': city,
              'referrer': referer_url,
              'searchId': None,
              'searchFunnelType': None,
              'isPriceFinderActive': None,
              'dateIndicator': None,
              'bannerMessage': '',
              'displayPrice': None,
          },
          'showHidden': False,
          'locationName': city,
          'sourceType': 'HOTEL_GEO',
          'isExtraBedIncluded': True,
          'isUseHotelSearchListAPI': True,
          'supportedDisplayTypes': ['INVENTORY', 'INVENTORY_LIST', 'HEADER', 'INVENTORY_WITH_HEADER'],
          'userSearchPreferences': [],
          'uniqueSearchId': None,
          'basicFilterSortSpec': {
              'basicSortType': 'POPULARITY',
              'ascending': False,
              'criteriaFilterSortSpec': None,
              'accommodationTypeFilter': [],
              'starRatingFilter': [True, True, True, True, True],
              'facilityFilter': [],
              'hasFreeCancellationRooms': False,
              'minPriceFilter': None,
              'maxPriceFilter': None,
              'quickFilterId': None,
              'skip': 0,
              'top': 100,
          },
          'criteriaFilterSortSpec': None,
          'boundaries': None,
          'contexts': {'isFamilyCheckbox': False},
      },
      'clientInterface': 'desktop',
  }

  print('preparing post ...')
  page = requests.post(api_url, headers=headers, json=payload, timeout=request_timeout)
  # fail here, with the HTTP status in the message, instead of later while
  # parsing an error page as JSON
  page.raise_for_status()
  print('preparing data collection ...')
  response = page.json()
  # NOTE: records are appended to the notebook-level `_data_` list, so running
  # this cell again without re-running the initialisation cell accumulates
  # the records of every run
  _data_ += collectData(response)
  print()


print()
print(f'Num of obtained records: {len(_data_)}')
print()


Scraping All Available Hotel Rooms, Checkin 31-12-2022, Checkout:02-01-2023, Staying nights: 2:


JAKARTA
preparing post ...
preparing data collection ...


100%|██████████| 63/63 [00:00<00:00, 48122.59it/s]




BANDUNG
preparing post ...
preparing data collection ...


100%|██████████| 53/53 [00:00<00:00, 38413.36it/s]




MEDAN
preparing post ...
preparing data collection ...


100%|██████████| 43/43 [00:00<00:00, 12224.98it/s]



DENPASAR
preparing post ...





preparing data collection ...


100%|██████████| 1/1 [00:00<00:00, 5607.36it/s]



BANDA ACEH
preparing post ...





preparing data collection ...


100%|██████████| 28/28 [00:00<00:00, 19287.32it/s]



PADANG
preparing post ...





preparing data collection ...


100%|██████████| 40/40 [00:00<00:00, 24603.63it/s]



PALEMBANG
preparing post ...





preparing data collection ...


100%|██████████| 53/53 [00:00<00:00, 32513.98it/s]



BOGOR
preparing post ...





preparing data collection ...


100%|██████████| 33/33 [00:00<00:00, 15511.83it/s]



SEMARANG
preparing post ...





preparing data collection ...


100%|██████████| 24/24 [00:00<00:00, 9236.86it/s]



YOGYAKARTA
preparing post ...





preparing data collection ...


100%|██████████| 51/51 [00:00<00:00, 10938.86it/s]



SOLO
preparing post ...





preparing data collection ...


100%|██████████| 32/32 [00:00<00:00, 13374.96it/s]



SURABAYA
preparing post ...





preparing data collection ...


100%|██████████| 53/53 [00:00<00:00, 11641.08it/s]



Num of obtained records: 904






### Scraping Result Checks

In [5]:
# build a DataFrame from the collected record dicts (one row per record)
# and preview its first rows
df = pd.DataFrame(_data_)
df.head()

Unnamed: 0,timestamp,city,checkin,checkout,num_staying_nights,id,name,displayName,region,starRating,...,cheapestRate_perNight_taxes,cheapestRate_perNight_totalFare,originalRate_allNights_baseFare,originalRate_allNights_fees,originalRate_allNights_taxes,originalRate_allNights_totalFare,originalRate_perNight_baseFare,originalRate_perNight_fees,originalRate_perNight_taxes,originalRate_perNight_totalFare
0,2022-12-29 20:41:15.405526,Jakarta,30-12-2022,02-01-2023,3,62303,The Sultan Hotel & Residence Jakarta,The Sultan Hotel Jakarta,"Senayan, Jakarta",5.0,...,415100.0,2391767,6300000,0,1323000,7623000,2100000,0.0,441000.0,2541000
1,2022-12-29 20:41:15.405705,Jakarta,30-12-2022,02-01-2023,3,2000000141144,Aryaduta Suite Semanggi,Aryaduta Suite Semanggi,"Karet Semanggi, Jakarta",4.0,...,416665.33,2400787,6196051,0,1301171,7497222,2065350,0.0,433723.67,2499074
2,2022-12-29 20:41:15.405741,Jakarta,30-12-2022,02-01-2023,3,9000000952972,Aloft South Jakarta,Aloft South Jakarta,"Cilandak Timur, Jakarta",4.0,...,0.0,1294700,4280000,898800,0,5178800,1426666,299600.0,0.0,1726267
3,2022-12-29 20:41:15.405779,Jakarta,30-12-2022,02-01-2023,3,9000000971405,"The Langham, Jakarta","The Langham, Jakarta","Senayan, Jakarta",5.0,...,1358931.0,7830031,25884400,0,5435724,31320124,8628133,0.0,1811908.0,10440041
4,2022-12-29 20:41:15.405806,Jakarta,30-12-2022,02-01-2023,3,4755,"Hotel Mulia Senayan, Jakarta","Hotel Mulia Senayan, Jakarta","Senayan, Jakarta",5.0,...,945000.0,5445000,17999999,0,3780000,21779999,5999999,0.0,1260000.0,7260000


In [14]:
# number of collected records for each (check-in date, city) pair
df.groupby(['checkin','city'])['city'].agg(['count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,count
checkin,city,Unnamed: 2_level_1
30-12-2022,Banda Aceh,25
30-12-2022,Bandung,49
30-12-2022,Bogor,25
30-12-2022,Denpasar,1
30-12-2022,Jakarta,58
30-12-2022,Medan,41
30-12-2022,Padang,33
30-12-2022,Palembang,51
30-12-2022,Semarang,19
30-12-2022,Solo,31


In [15]:
# total number of rows in the DataFrame
len(df)

904

### Export Result

In [16]:
# write the DataFrame to a '|'-separated CSV file, leaving out the index column
df.to_csv('traveloka_newyearhotels.csv', sep='|', index=False)