<a href="https://colab.research.google.com/github/amien1410/amien-scrapers/blob/main/Tripadvisor_Reviews_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title

# Install the third-party packages this notebook needs. Keep these lines active
# when running on Google Colab; otherwise comment them out and install the
# packages manually from your terminal.
!pip install "httpx[http2,brotli]" parsel
!pip install loguru

# import modules
import re
import httpx
import json
import random
import string
import difflib
import math
import numpy as np
import pandas as pd
from google.colab import files
from loguru import logger as log

# Download the input Excel file from the repo when running on Google Colab;
# comment this line out when running on your own machine.
!wget 'https://github.com/amien1410/my_csvs/raw/main/sudeep.xlsx'

# Read the property spreadsheet (point 'filepath' at its location on your
# machine) and turn every row into one record of the 'inputs' list.
filepath = "/content/sudeep.xlsx"
df = pd.read_excel(filepath)
inputs = [
    {
        "propertyName": df['Property Name'][row_label],
        "rooms": df['Rooms'][row_label],
        "address": df['Property Address'][row_label],
        "city": df['City'][row_label],
        "state": df['State'][row_label],
        "zipcode": df['Zip'][row_label],
        "lat": df['Latitude'][row_label],
        "long": df['Longitude'][row_label],
    }
    for row_label in df.index
]

# To avoid being instantly blocked we'll be using request headers that
# mimic Chrome browser on Windows.
# Each header appears exactly once: a repeated key in a dict literal silently
# keeps only the last value, which previously replaced the well-formed
# Accept-Language list with a malformed one ("en-US;en;q=0.9").
BASE_HEADERS = {
    "authority": "www.tripadvisor.com",
    "accept-language": "en-US,en;q=0.9",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "accept-encoding": "gzip, deflate, br",
}
# Shared asynchronous HTTP session: every request carries BASE_HEADERS and is
# sent over HTTP/2, with a 15 second timeout and at most 5 connections.
client = httpx.AsyncClient(
    headers=BASE_HEADERS,
    http2=True,  # http2 connections are significantly less likely to get blocked
    limits=httpx.Limits(max_connections=5),
    timeout=httpx.Timeout(15.0),
)

# Below are the helper functions:

# 1 - degrees to radians
def deg2rad(degrees):
    """Convert an angle expressed in degrees to radians."""
    scaled = degrees * math.pi
    return scaled / 180.0
# 2 - radians to degrees
def rad2deg(radians):
    """Convert an angle expressed in radians to degrees."""
    scaled = radians * 180.0
    return scaled / math.pi

# Semi-axes of the WGS-84 reference ellipsoid
WGS84_a = 6378137.0  # Major semiaxis [m]
WGS84_b = 6356752.3  # Minor semiaxis [m]

# 3 - Earth radius at a given latitude, according to the WGS-84 ellipsoid [m]
def WGS84EarthRadius(lat):
    """Return the ellipsoid radius in metres at latitude `lat`.

    `lat` is fed straight into math.cos/math.sin, so it must be in radians.
    Formula: sqrt(((a^2 cos)^2 + (b^2 sin)^2) / ((a cos)^2 + (b sin)^2)),
    see http://en.wikipedia.org/wiki/Earth_radius
    """
    cos_lat = math.cos(lat)
    sin_lat = math.sin(lat)

    # Numerator terms use the squared semi-axes, denominator terms the plain ones.
    num_a = WGS84_a*WGS84_a * cos_lat
    num_b = WGS84_b*WGS84_b * sin_lat
    den_a = WGS84_a * cos_lat
    den_b = WGS84_b * sin_lat

    numerator = num_a*num_a + num_b*num_b
    denominator = den_a*den_a + den_b*den_b
    return math.sqrt(numerator/denominator)

# 4 - Bounding box surrounding the point at given coordinates,
# assuming local approximation of Earth surface as a sphere
# of radius given by WGS84
def boundingBox(latitudeInDegrees, longitudeInDegrees, halfSideInKm):
    """Return (latMin, lonMin, latMax, lonMax) in degrees around a centre point.

    The box extends `halfSideInKm` kilometres from the centre in each
    direction. Latitude offsets are measured along a sphere whose radius is
    WGS84EarthRadius at the centre latitude; longitude offsets along the
    parallel (that radius times cos(latitude)).
    """
    centerLat = deg2rad(latitudeInDegrees)
    centerLon = deg2rad(longitudeInDegrees)
    halfSideInMeters = 1000*halfSideInKm

    # Local Earth radius, and the radius of the circle of latitude through the centre
    earthRadius = WGS84EarthRadius(centerLat)
    parallelRadius = earthRadius*math.cos(centerLat)

    # Angular half-extent of the box in each direction, in radians
    latSpan = halfSideInMeters/earthRadius
    lonSpan = halfSideInMeters/parallelRadius

    corners = (
        centerLat - latSpan,
        centerLon - lonSpan,
        centerLat + latSpan,
        centerLon + lonSpan,
    )
    return tuple(rad2deg(angle) for angle in corners)

# 5 - Haversine function to determine the distance between two geo location
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points given in degrees.

    The result is scaled by a sphere radius of 6371.0, i.e. it is expressed
    in kilometres when that constant is read as the mean Earth radius in km.
    """
    sphere_radius = 6371.0

    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    # haversine formula
    a = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2
    central_angle = 2*np.arctan2(np.sqrt(a), np.sqrt(1-a))

    return sphere_radius * central_angle

# Below are the main functions

# Get 'geoId' from given city or location
async def get_geo_id(query, client):
    """Resolve a free-text location to the id embedded in its TripAdvisor URL.

    Posts `query` to the typeahead GraphQL endpoint and inspects only the
    first result.

    Args:
        query: Location text to search for (main() passes the city name).
        client: Object with an awaitable ``post(url=..., json=..., headers=...)``
            method whose response offers ``raise_for_status()`` and
            ``content`` (for example the notebook's ``httpx.AsyncClient``).

    Returns:
        str: The first run of digits found in the top result's ``url``.

    Raises:
        IndexError: If the endpoint returns no results, or the top result's
            URL contains no digits.
        Exception: Whatever ``raise_for_status()`` raises for an unsuccessful
            response (``httpx.HTTPStatusError`` with httpx).
    """

    # set the payload request
    payload = [
        {
          "query": "5eec1d8288aa8741918a2a5051d289ef",
          "variables": {
            "request": {
              "query": query,
              "limit": 10,
              "scope": "WORLDWIDE",
              "locale": "en-US",
              "scopeGeoId": 1,
              "searchCenter": None,
              "types": [
                "LOCATION"
              ],
              "locationTypes": [
                "GEO",
                "AIRPORT",
                "ACCOMMODATION",
                "ATTRACTION",
                "ATTRACTION_PRODUCT",
                "EATERY",
                "NEIGHBORHOOD",
                "AIRLINE",
                "SHOPPING",
                "UNIVERSITY",
                "GENERAL_HOSPITAL",
                "PORT",
                "FERRY",
                "CORPORATION",
                "VACATION_RENTAL",
                "SHIP",
                "CRUISE_LINE",
                "CAR_RENTAL_OFFICE"
              ],
              "userId": None,
              "articleCategories": [
                "default",
                "love_your_local",
                "insurance_lander"
              ],
              "enabledFeatures": [
                "typeahead-q"
              ]
            }
          }
        }
      ]

    # we need to generate a random request ID for this request to succeed
    random_request_id = "".join(
        random.choices(string.ascii_lowercase + string.digits, k=180)
    )

    # set the headers
    headers = {
        "X-Requested-By": random_request_id,
        "Referer": "https://www.tripadvisor.com/Hotels",
        "Origin": "https://www.tripadvisor.com",
    }

    # get the page with method POST and pass the payload and headers as the parameters
    result = await client.post(
        url="https://www.tripadvisor.com/data/graphql/ids",
        json=payload,
        headers=headers,
    )
    # Fail with the HTTP status instead of an opaque JSON/KeyError further down
    # when the request was rejected (e.g. blocked or rate limited).
    result.raise_for_status()

    # get the results from page content
    data = json.loads(result.content)
    results = data[0]["data"]["Typeahead_autocomplete"]["results"]
    if not results:
        raise IndexError(f"TripAdvisor returned no locations for query {query!r}")

    # only the top hit is used, so only its details need to be present
    metadata = results[0]["details"]

    # the id is the first run of digits in the result URL
    id_match = re.search(r'\d+', metadata['url'])
    if id_match is None:
        raise IndexError(f"No numeric id in TripAdvisor URL {metadata['url']!r}")
    return id_match.group(0)

# this function is to get properties based on given geoId, latitude and longitude,
# and return the nearest property to the given latitude and longitude point
async def scrape_hotels(geoId, latitude, longitude, client, max_distance_km=1.0):
    """Find the listed property closest to a coordinate.

    Queries the hotel-list GraphQL endpoint for a bounding box around
    (latitude, longitude), computes each result's haversine distance to that
    point and returns the closest one.

    Args:
        geoId: Id placed in the request's ``route.params.geoId``.
        latitude: Latitude of the target point, in degrees.
        longitude: Longitude of the target point, in degrees.
        client: Object with an awaitable ``post(url=..., json=..., headers=...)``
            method whose response offers ``raise_for_status()`` and ``content``.
        max_distance_km: Half side of the search box and the (exclusive) upper
            limit on the accepted distance. Defaults to 1.0, the value that
            used to be hard-coded.

    Returns:
        dict: The nearest property with keys ``locationId``, ``name``,
        ``address``, ``count``, ``rating``, ``latitude``, ``longitude`` and
        ``distance``.

    Raises:
        LookupError: If no usable property lies closer than ``max_distance_km``.
        Exception: Whatever ``raise_for_status()`` raises for an unsuccessful
            response (``httpx.HTTPStatusError`` with httpx).
    """

    # get the bounding box based on given latitude and longitude
    latMin, lonMin, latMax, lonMax = boundingBox(latitude, longitude, max_distance_km)

    # set the payload
    payload = [
      {
        "query": "a10df0f3b4bf06a124ca09c10d59ae3c", # Never leave it as None or empty
        "variables": {
          "geoId": None,
          "blenderId": None,
          "boundingBox": {
            "southWestCorner": {
              "latitude": latMin,
              "longitude": lonMin
            },
            "northEastCorner": {
              "latitude": latMax,
              "longitude": lonMax
            }
          },
          "centerAndRadius": None,
          "travelInfo": {
            "usedDefaultDates": True,
            "checkInDate": "2023-07-14", # Any date works, but never leave it empty
            "checkOutDate": "2023-07-14", # Any date works, but never leave it empty
            "rooms": 0, # Never leave it as None or empty
            "adults": 0, # Never leave it as None or empty
            "childrenAges": []
          },
          "currency": "USD",
          "pricingMode": None,
          "filters": {
            "minRating": None,
            "neighborhoodsOrNear": None,
            "priceRange": None,
            "amenities": None,
            "brands": None,
            "classes": None,
            "styles": None,
            "hoteltypes": None,
            "categories": None,
            "anyTags": None
          },
          "offset": 0,
          "limit": 10000, # 30, 100 and 10000 have all been tried and work fine
          "sort": "BEST_VALUE",
          "clientType": "DESKTOP",
          "productId": "Hotels",
          "pageviewId": "", # Not important for now, can be left empty
          "sessionId": "", # Not important for now, can be left empty
          "amenityLimit": 0, # Never leave it as None or empty
          "route": {
            "page": "HotelsFusion",
            "params": {
              "geoId": geoId,
              "contentType": "hotel",
              "webVariant": "HotelsFusion"
            }
          },
          "userEngagedFilters": False,
          "isMapView": True,
          "polling": False,
          "tertiaryOffers": False,
          "includePhotoSizes": False,
          "requestNumber": 1
        }
      }
    ]

    # we need to generate a random request ID for this request to succeed
    random_request_id = "".join(
        random.choices(string.ascii_lowercase + string.digits, k=180)
    )

    # set the headers
    headers = {
        "X-Requested-By": random_request_id,
        "Referer": "https://www.tripadvisor.com/Hotels",
        "Origin": "https://www.tripadvisor.com",
    }

    # get the page with method POST and pass the payload and headers as the parameters
    result = await client.post(
        url="https://www.tripadvisor.com/data/graphql/ids",
        json=payload,
        headers=headers,
    )
    # Surface a rejected request as an HTTP error rather than a parsing error.
    result.raise_for_status()

    # get the results from page content
    data = json.loads(result.content)
    results = data[0]["data"]["list"]["results"]

    # fetch and format the results, then store them into the 'hotels' list
    hotels = []
    for entry in results:
        try:
            location = entry["location"]
            details = location["locationV2"]

            name = details["names"]["name"]
            if name is None:
                name = ""

            lat2 = details["geocode"]["latitude"]
            lon2 = details["geocode"]["longitude"]
            hotels.append({
                "locationId": entry["locationId"],
                "name": name,
                "address": details["contact"]["streetAddress"]["fullAddress"],
                "count": location["reviewSummary"]["count"],
                "rating": location["reviewSummary"]["rating"],
                "latitude": lat2,
                "longitude": lon2,
                "distance": haversine(latitude, longitude, lat2, lon2),
            })
        except Exception:
            # Best effort: an entry with missing or malformed fields is skipped.
            # 'Exception' (not a bare except) lets interrupts propagate.
            continue

    # print total of the found properties
    log.info(f"Found {len(hotels)} hotels.")

    # keep only properties strictly closer than the limit and pick the closest
    # (the first one wins on ties)
    in_range = [hotel for hotel in hotels if hotel["distance"] < max_distance_km]
    if not in_range:
        raise LookupError(
            f"No hotel found within {max_distance_km} km of ({latitude}, {longitude})."
        )
    nearest = min(in_range, key=lambda hotel: hotel["distance"])

    # print the nearest property and return it
    log.info(f"The nearest one with the given latitude and longitude is {nearest['name']} just {nearest['distance']} from the target.")
    return nearest

# this function is to scrape reviews from given locationId (nearest property)
async def scrape_reviews(geoId, locId, address, inputs):
    """Collect every review of one property, page by page.

    Requests pages of 10 reviews from the review-list GraphQL endpoint through
    the module-level ``client`` until a page comes back with fewer than 10
    entries, and flattens each review into one record.

    Args:
        geoId: Accepted for call compatibility; not used by this function.
        locId: TripAdvisor location id of the property.
        address: Address stored as ``newAddress`` in every record.
        inputs: One input record (mapping) providing ``propertyName``,
            ``rooms``, ``address``, ``city``, ``state``, ``zipcode``, ``lat``
            and ``long``; these are copied into every record.

    Returns:
        list[dict]: One record per review, combining the input fields with
        ``newPropertyName``, ``title``, ``text``, ``response``,
        ``additionalRatings``, ``createdDate``, ``stayDate``, ``userName`` and
        ``rating``.

    Raises:
        Exception: Whatever ``raise_for_status()`` raises for an unsuccessful
            response (``httpx.HTTPStatusError`` with httpx).
    """

    # number of reviews requested per page; also drives the offset step and
    # the last-page check so the three can never disagree
    page_size = 10

    def _nested_text(review, outer_key, inner_key):
        """Return review[outer_key][inner_key], or "" when either level is absent."""
        outer = review.get(outer_key)
        if outer is None:
            return ""
        return outer.get(inner_key, "")

    total_reviews = []
    offset = 0

    while True:

        # set the payload
        payload = [
          {
            "query": "ea9aad8c98a6b21ee6d510fb765a6522",
            "variables": {
              "locationId": locId,
              "offset": offset,
              "filters": [

              ],
              "prefs": None,
              "initialPrefs": {},
              "limit": page_size,
              "filterCacheKey": "locationReviewFilters_"+str(locId),
              "prefsCacheKey": "locationReviewPrefs_"+str(locId),
              "needKeywords": False,
              "keywordVariant": "location_keywords_v2_llr_order_30_en"
            }
          }
        ]

        # we need to generate a random request ID for this request to succeed
        random_request_id = "".join(
            random.choices(string.ascii_lowercase + string.digits, k=180)
        )

        # set headers
        headers = {
            "X-Requested-By": random_request_id,
            "Referer": "https://www.tripadvisor.com/Hotels",
            "Origin": "https://www.tripadvisor.com",
        }

        # get the page with method POST and pass the payload and headers as the parameters
        result = await client.post(
            url="https://www.tripadvisor.com/data/graphql/ids",
            json=payload,
            headers=headers,
        )
        # Surface a rejected request as an HTTP error rather than a parsing error.
        result.raise_for_status()

        # get the results from page content
        data = json.loads(result.content)
        reviews = data[0]["data"]["locations"][0]["reviewListPage"]["reviews"]

        # fetch and format the results, then store them into the 'total_reviews' list
        for review in reviews:

            # per-category ratings rendered as "label:rating, label:rating";
            # a missing or null list yields an empty string
            ratings = review.get("additionalRatings") or []
            categoriesReview = ", ".join(
                f"{item['ratingLabel']}:{item['rating']}" for item in ratings
            )

            try:
                total_reviews.append({
                    "oldPropertyName": inputs["propertyName"],
                    "newPropertyName": _nested_text(review, "location", "name"),
                    "rooms": inputs["rooms"],
                    "oldAddress": inputs["address"],
                    "newAddress": address,
                    "city": inputs["city"],
                    "state": inputs["state"],
                    "zipcode": inputs["zipcode"],
                    "lat": inputs["lat"],
                    "long": inputs["long"],
                    "title": review.get('title', ''),
                    "text": review.get('text', ''),
                    "response": _nested_text(review, "mgmtResponse", "text"),
                    "additionalRatings": categoriesReview,
                    "createdDate": review.get('createdDate', ''),
                    "stayDate": _nested_text(review, "tripInfo", "stayDate"),
                    "userName": _nested_text(review, "userProfile", "displayName"),
                    "rating": review.get('rating', '')
                })

            # if building the record fails (e.g. an input field is missing), skip it
            except Exception:
                continue

        # a page shorter than page_size is the last one
        if len(reviews) < page_size:
            break

        # otherwise move on to the next page
        offset += page_size

    # return total_reviews as the result
    log.info(f"Total of reviews from this hotel/property is: {len(total_reviews)}")
    return total_reviews

# this is the main function that wraps all functions above
async def main():
    """Scrape the reviews for every record in the module-level ``inputs`` list.

    For each record it resolves the city's geoId, finds the property nearest
    to the record's coordinates and collects that property's reviews, using
    the module-level ``client``. A record whose processing raises an ordinary
    exception is logged and skipped; interrupts and task cancellation are not
    swallowed, so a running cell can still be stopped.

    Returns:
        pandas.DataFrame: One row per collected review (empty if none).
    """

    # set results list variable
    results = []

    # fetch the inputs
    for position, record in enumerate(inputs):
        try:
            # get the geoId
            geoId = await get_geo_id(record["city"], client)
            # get the properties
            hotel = await scrape_hotels(geoId, record["lat"], record["long"], client)
            # get the reviews
            reviews = await scrape_reviews(geoId, hotel['locationId'], hotel['address'], record)

        # if there is an error, report it and continue to the next input
        except Exception as error:
            log.warning(f"Skipping input #{position}: {error!r}")
            continue

        # extend in place instead of rebuilding the list on every iteration
        results.extend(reviews)

    # print the log
    log.info(f"Total of reviews is: {len(results)}")

    # convert the results list into a dataframe
    df = pd.DataFrame(results)
    # uncomment to also write the results to an Excel file
    # df.to_excel("Results.xlsx")

    # return the dataframe to show it
    return df

In [None]:
# RUN THIS CELL TO SEE THE RESULT
# Awaits main() directly (notebook cells allow top-level await), keeps the
# returned DataFrame in 'df' and displays it as the cell output.
df = await main()
df

[32m2023-09-12 03:01:51.411[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_hotels[0m:[36m322[0m - [1mFound 86 hotels.[0m
[32m2023-09-12 03:01:51.418[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_hotels[0m:[36m333[0m - [1mThe nearest one with the given latitude and longitude is Motel 6 San Diego, CA - Downtown just 0.10082159395556078 from the target.[0m
[32m2023-09-12 03:02:08.744[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_reviews[0m:[36m473[0m - [1mTotal of reviews from this hotel/property is: 415[0m
[32m2023-09-12 03:02:09.932[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_hotels[0m:[36m322[0m - [1mFound 101 hotels.[0m
[32m2023-09-12 03:02:09.935[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_hotels[0m:[36m333[0m - [1mThe nearest one with the given latitude and longitude is The Keating Hotel By Pininfarina just 0.011208417015366078 from the target.[0m
[32m2023-09-12 03:02:48.169[0m | [1mINFO    [0m | [36m

In [None]:
# This cell is for testing, just ignore it.
# It calls scrape_reviews directly with a hard-coded geoId, locationId,
# address and input record instead of going through main().
await scrape_reviews(60750 , 226611 , "1546 2nd Ave I-5/San Diego Freeway at 6th Avenue, San Diego, CA 92101-3006", {'propertyName': 'Brunswick', 'rooms': 35, 'address': '1468 1st Ave', 'city': 'San Diego', 'state': 'CA', 'zipcode': '92101-3011', 'lat': 32.7207426, 'long': -117.1640128})