In [2]:
# Import BeautifulSoup and splinter
from bs4 import BeautifulSoup as soup
# from bs4 import SoupStrainer
from splinter import Browser

# Import other dependencies
import time
import pandas as pd
import numpy as np
from pathlib import Path
import requests
import json

# Import geoapify api key
from config import geoapify_key

In [None]:
# geoapify url endpoint
base_url = "https://api.geoapify.com/v1/geocode/search"

# List of urls and category used for scraping
url_list = []
category_list = []

url_list.append('https://www.tripadvisor.com/TravelersChoice-Beaches')
category_list.append('beaches')

url_list.append("https://www.tripadvisor.com/TravelersChoice-ThingsToDo")
category_list.append("things")

# url_list.append("https://www.tripadvisor.com/TravelersChoice-Restaurants")
# category_list.append("restaurants")


In [None]:
# loop thru url list that we need to scrape and get data (done only if csv of data is not present)

for index, item in enumerate(url_list):
    
    search_file = "static/data/" + category_list[index] + ".csv"
    csv_file = Path(search_file)

    if csv_file.exists():
        print(f"File {csv_file} already extracted. Skipping scrapping")
    else:
                
        # Set up Splinter
        browser = Browser('chrome')

        # Visit the website - sleep provided to avoid continuous calls   
        browser.visit(item)
        time.sleep(30)       
        # Optional delay for loading the page
        browser.is_element_present_by_css('div.list_text', wait_time=1)

        # Scrape the website html
        html = browser.html

        # Create a BeautifulSoup object from the scraped HTML
        data = soup(html, 'html.parser')

        # Get Rank and name
        names = data.find_all(class_="mainName extra")
        namelist = []
        ranklist = []
        parts = []
        for i in names:
            href=i.find("a")    
            text_data=href.text   
            parts = text_data.split('.')    
            ranklist.append(parts[0])
            namelist.append(parts[1].strip())
    
        # Get City and Country (single string)
        cities = data.find_all(class_="smaller")
        citylist = []
        lat = []
        lon = []
        for i in cities:
            href=i.find("a")
            city_loc = href.text    
            citylist.append(city_loc)
            
            # call geoapify to get lat/lon for location
            params = {
                "text": city_loc,
                "apiKey": geoapify_key }
            # Run request
            response = requests.get(base_url, params=params).json()
            
            # Extract lat/lon
            latitude = response["features"][0]["properties"]["lat"]
            longitude = response["features"][0]["properties"]["lon"]

            # append to list
            lat.append(latitude)
            lon.append(longitude)
    

        # Get image urls
        images = data.find_all(class_="sizedThumb_container")
        iurls = []
        for i in images:
            href=i.find("img")
            iurls.append(href["src"])

        # Get description of destination/a customer review 
        desclist = []
        for texts in data.find_all(class_="quot"):    
            x = texts.find("i").next_sibling.strip()        
            desclist.append(x)

        # Get url to go retrieve rating and reviews - this will be used later to get details
        url_ary = []
        for lnk in data.find_all(class_="firstone"):
            href=lnk.find("a")    
            url_ary.append(href['href'])

        # Close browser
        browser.quit() 

        # Create array with category and count of rank array
        cat_list = np.repeat([category_list[index]], len(ranklist))

        # Load arrays as columns of dataframe
        df = pd.DataFrame({"category": cat_list, "rank": ranklist, "name": namelist, "location": citylist, 
                       "imageurl": iurls, "description": desclist, "latitude": lat, "longitude": lon, "ratingurl": url_ary})
        
        # save dataframe as csv 
        filename = "static/data/" + category_list[index] + ".csv"
        df.to_csv(filename, encoding="utf-8", index=False, header=True)
        

    # sleep for a minute before calling next url (providing break to avoid continuous pings to website)
    time.sleep(10)

# Final checks
browser.quit()

In [27]:
# Get rating and misc info for all restaurants
# prefix = "https://www.tripadvisor.com"
# url = prefix + "/Restaurant_Review-g186319-d1209702-Reviews-The_Old_Stamp_House_Restaurant-Ambleside_Lake_District_Cumbria_England.html"
url = "https://www.tripadvisor.com/Attraction_Review-g10006284-d148331-Reviews-Grace_Bay_Beach-Grace_Bay_Providenciales_Turks_and_Caicos.html"

# setup Splinter
browser = Browser('chrome')

# Visit the website - sleep provided to avoid continuous calls   
browser.visit(url)
time.sleep(30)

# Scrape the website html
html = browser.html
# specify select section
# parse_section = SoupStrainer(id=['component_46'])
# parse_section = SoupStrainer('div', attrs={'class_': 'SrqKb'})

# Create a BeautifulSoup object from the scraped HTML
# raters = soup(html, 'html.parser', parse_only=parse_section)
raters = soup(html, 'html.parser')
browser.quit()

In [7]:
# get overall rating (has decimals)
rating_text = raters.find('span', {'class_': "ZDEqb"}).text
rating = float(rating_text)

# get review count
rcount_text = raters.find('a', {'class_': 'IcelI'}).text
# remove all ',' separators and cast as int
rcount_text = rcount_text.replace(' reviews', '')
rcount = int(rcount_text.replace(',',''))  #also remove ' reviews'

# get the different types of ratings and their values (repeating 4 times to get 4 rating types)
rtype = []
rvalue = []
div1 = raters.find('div', {'class_': 'DzMcu'})
rtype.append(div1.find_next('span', {'class_': 'BPsyj'})).text
span1 = div1.find_next('span', {'class_': 'vzATR'})
t1 = span1.find_next('span')
tx = t1['data-fmid']
n1 = float(tx[-2:])
n1 = n1/10
rvalue.append(n1)

div2 = div1.find_next('div', {'class_': 'DzMcu'})
rtype.append(div2.find_next('span', {'class_': 'BPsyj'})).text
span2 = div2.find_next('span', {'class_': 'vzATR'})
t1 = span2.find_next('span')
tx = t1['data-fmid']
n1 = float(tx[-2:])
n1 = n1/10
rvalue.append(n1)

div3 = div2.find_next('div', {'class_': 'DzMcu'})
rtype.append(div3.find_next('span', {'class_': 'BPsyj'})).text
span3 = div3.find_next('span', {'class_': 'vzATR'})
t1 = span3.find_next('span')
tx = t1['data-fmid']
n1 = float(tx[-2:])
n1 = n1/10
rvalue.append(n1)

div4 = div3.find('div', {'class_': 'DzMcu'})
rtype.append(div4.find_next('span', {'class_': 'BPsyj'})).text
span4 = div4.find_next('span', {'class_': 'vzATR'})
t1 = span4.find_next('span')
tx = t1['data-fmid']
n1 = float(tx[-2:])
n1 = n1/10
rvalue.append(n1)

# get price range
prange = raters.find('div', {'class_': 'SrqKb'}).text
#get cuisines
cuisines = raters.find('div', {'class_': 'SrqKb'}).text

AttributeError: 'NoneType' object has no attribute 'text'

In [9]:
with open('htmldat.txt', 'w') as f:
    f.write(str(raters))

In [10]:
with open('htmldat.txt') as f:
    s1 = soup(f, 'html.parser')
    

In [24]:
x1 = s1.find('span', {'class': 'row_num  is-shown-at-tablet'})
print(x1)

None


In [30]:
w = raters.find('div', {'class_': 'biGQs _P fiohW hzzSG uuBRH'})
print(w)

None
