<a href="https://colab.research.google.com/github/amien1410/amien-scrapers/blob/main/New_Tripadvisor_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install "httpx[http2,brotli]" parsel

Collecting parsel
  Downloading parsel-1.9.1-py2.py3-none-any.whl.metadata (11 kB)
Collecting httpx[brotli,http2]
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting httpcore==1.* (from httpx[brotli,http2])
  Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)
Collecting brotli (from httpx[brotli,http2])
  Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.5 kB)
Collecting h2<5,>=3 (from httpx[brotli,http2])
  Downloading h2-4.1.0-py3-none-any.whl.metadata (3.6 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx[brotli,http2])
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Collecting cssselect>=1.2.0 (from parsel)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting jmespath (from parsel)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting w3lib>=1.19.0 (from parsel)
  Downloading w3lib-2.2.1-py3-none-any.

In [None]:
import re
import httpx
import json
import time
import string
import random
import datetime
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
def _text_or_na(value):
    """Return ``value`` as a string, or 'N/A' when it is None."""
    return 'N/A' if value is None else str(value)


def _amenity_label(amenity):
    """Return the localized label of one amenity entry, or 'N/A' if it is absent."""
    value = amenity.get('value') if isinstance(amenity, dict) else None
    if not isinstance(value, dict):
        return 'N/A'
    return _text_or_na(value.get('localizedText', 'N/A'))


def _nearby_location_names(nearby_info):
    """Comma-join ``locationName`` from a single nearby entry or a list of entries."""
    entries = nearby_info if isinstance(nearby_info, list) else [nearby_info]
    return ', '.join(
        _text_or_na(entry.get('locationName', 'N/A') if isinstance(entry, dict) else None)
        for entry in entries
    )


def extract_rental_info(json_data):
    """Flatten one rental-search listing into a flat dict of selected fields.

    Args:
        json_data: A listing dict. The function reads its ``rental`` and
            ``nearbyInfo`` entries; nested objects that are missing *or*
            explicitly ``None`` (JSON ``null``) are treated as empty.

    Returns:
        dict with the keys ``rental_id``, ``rental_name``, ``bath_count``,
        ``room_count``, ``sleep_count``, ``averageRatingNumber``, ``url``,
        ``hasPaymentProtection``, ``isAffiliate``, ``affiliateLogoUrl``,
        ``photos``, ``latitude``, ``longitude``, ``address``,
        ``rental_description``, ``rentalCategory``, ``amenities`` and
        ``nearby_locationName``. Missing scalar fields default to ``'N/A'``;
        ``photos`` / ``amenities`` fall back to a "No ... available" string.
        Returns ``None`` (after printing the error) if extraction fails, e.g.
        when ``json_data`` is not a dict.
    """
    try:
        # `.get(key, {})` only covers a *missing* key; `or {}` also covers an
        # explicit null, which would otherwise raise AttributeError below and
        # discard the whole listing.
        rental = json_data.get('rental') or {}
        quick_view = rental.get('quickView') or {}
        detail_route = rental.get('detailRoute') or {}
        geo = rental.get('geoCoordinates') or {}
        nearby_info = json_data.get('nearbyInfo') or {}
        is_affiliate = rental.get('isAffiliate', False)

        photos = rental.get('photos')
        if photos:
            photo_info = [
                {
                    'url': photo.get('dynamicUrl', 'N/A'),
                    'description': photo.get('description', 'N/A'),
                }
                for photo in photos
                if isinstance(photo, dict)  # skip null / malformed entries
            ]
        else:
            photo_info = 'No photos available'

        amenities = quick_view.get('amenities')
        if amenities:
            amenity_text = ', '.join(_amenity_label(amenity) for amenity in amenities)
        else:
            amenity_text = 'No amenities available'

        return {
            'rental_id': rental.get('id', 'N/A'),
            'rental_name': rental.get('name', 'N/A'),
            'bath_count': rental.get('bathCount', 'N/A'),
            'room_count': rental.get('roomCount', 'N/A'),
            'sleep_count': rental.get('sleepCount', 'N/A'),
            'averageRatingNumber': rental.get('averageRatingNumber', 'N/A'),
            'url': detail_route.get('url', 'N/A'),
            'hasPaymentProtection': rental.get('hasPaymentProtection', False),
            'isAffiliate': is_affiliate,
            # The logo is only reported for affiliate listings.
            'affiliateLogoUrl': rental.get('affiliateLogoUrl') if is_affiliate else None,
            'photos': photo_info,
            'latitude': geo.get('lat', 'N/A'),
            'longitude': geo.get('lng', 'N/A'),
            'address': quick_view.get('address', 'N/A'),
            'rental_description': quick_view.get('description', 'N/A'),
            'rentalCategory': quick_view.get('rentalCategory', 'N/A'),
            'amenities': amenity_text,
            # Accepts one nearby entry or a list of them (names comma-joined).
            'nearby_locationName': _nearby_location_names(nearby_info),
        }

    except Exception as e:
        # Best-effort contract: report the problem and let the caller skip
        # this listing instead of aborting the whole run.
        print(f"Error extracting rental information: {e}")
        return None

In [None]:
# Default headers sent with every request made through the shared HTTP client.
# "accept-language" used to be listed twice; a dict literal keeps only the last
# value of a repeated key, so the malformed "en-US;en;q=0.9" (semicolon instead
# of comma between language ranges) silently replaced the valid value below.
BASE_HEADERS = {
    "authority": "www.tripadvisor.com",
    "accept-language": "en-US,en;q=0.9",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "accept-encoding": "gzip, deflate, br",
}
# Shared asynchronous HTTP client: every request made through it inherits
# BASE_HEADERS unless the call passes its own overrides.
client = httpx.AsyncClient(
    headers=BASE_HEADERS,
    http2=True,  # speak HTTP/2; per the original note this is less likely to get blocked
    limits=httpx.Limits(max_connections=5),  # cap concurrent connections at 5
    timeout=httpx.Timeout(15.0),  # 15-second timeout
)

In [None]:
# Request body: a one-element batch holding a single pre-registered query.
# It asks for up to 100 rentals starting at offset 0, sorted by popularity,
# with prices in USD.
payload = [
    {
        "variables": {
            "offset": 0,
            "rentalCountLimit": 100,
            "currencyCode": "USD",
            "urlParams": [],
            "sortOrder": "POPULARITY",
            # NOTE(review): 60763 looks like New York City, judging by the
            # "g60763" in the listing URLs - confirm before reusing.
            "geoId": 60763,
        },
        # Opaque ID of the server-side query to run.
        "extensions": {"preRegisteredQueryId": "a926137f5e9dcd0c"},
    }
]

# Build a throwaway request ID: 180 characters drawn from lowercase letters
# and digits. Per the original note, the request only succeeds with one.
random_request_id = "".join(
    [random.choice(string.ascii_lowercase + string.digits) for _ in range(180)]
)

# Per-request headers sent alongside the client defaults.
headers = {"X-Requested-By": random_request_id}
headers.update(
    Referer="https://www.tripadvisor.com/Hotels",
    Origin="https://www.tripadvisor.com",
)

# get the page with method POST and pass the payload and headers as the parameters
result = await client.post(
    url="https://www.tripadvisor.com/data/graphql/ids",
    json=payload,
    headers=headers,
)

# get the results from page content
data = json.loads(result.content)
listings = data[0]["data"]['RentalInformation_legacyRentalSearch']['listings']
listings

In [None]:
# Spot-check the extractor on a single listing. The notebook gives no reason
# for choosing index 11; it raises IndexError if fewer than 12 listings exist.
rentalInfo = extract_rental_info(listings[11])
# Bare last expression so the notebook displays the extracted dict.
rentalInfo

{'rental_id': 24355939,
 'rental_name': 'Sonder City Hall Park | Studio Apartment',
 'bath_count': 1,
 'room_count': 0,
 'sleep_count': 2,
 'averageRatingNumber': 5,
 'url': '/VacationRentalReview-g60763-d24355939-Sonder_City_Hall_Park_Studio_Apartment-New_York_City_New_York.html',
 'hasPaymentProtection': False,
 'isAffiliate': True,
 'affiliateLogoUrl': '/img2/vacationrentals/affiliate-logos/logo-vrbo-darker.svg',
 'photos': [{'url': 'https://dynamic-media.tacdn.com/media/vr-ha-splice-j/12/1b/dd/b3.jpg',
   'description': 'Room'},
  {'url': 'https://dynamic-media.tacdn.com/media/vr-ha-splice-j/12/1b/73/0c.jpg',
   'description': None},
  {'url': 'https://dynamic-media.tacdn.com/media/vr-ha-splice-j/12/45/d3/72.jpg',
   'description': None},
  {'url': 'https://dynamic-media.tacdn.com/media/vr-ha-splice-j/12/45/d3/73.jpg',
   'description': None},
  {'url': 'https://dynamic-media.tacdn.com/media/vr-ha-splice-j/12/1b/dd/b0.jpg',
   'description': 'Private kitchen'},
  {'url': 'https://d