Some considerations for scraping:
- Are there terms and conditions for using the website?
- Test your scraping bot on in small samples to debug before scaling to hundreds, thousands or millions of requests.
- Start thinking about your IP address: getting blacklisted from a website is no fun. Consider using a VPN.
- Slow your bot down! Add delays along the way with the time package. Specifically, time.sleep(seconds) adds wait time in a program.

Can also try: Selenium and Scrapy


In [1]:
import requests
import re
from bs4 import BeautifulSoup

In [10]:
# Make a 'get' request to retrieve the page
html_page = requests.get('https://www.tripadvisor.com/Hotel_Review-g42139-d13417189-Reviews-The_Siren_Hotel-Detroit_Michigan.html#REVIEWS') 

# Pass the page contents to beautiful soup for parsing
soup = BeautifulSoup(html_page.content, 'html.parser') 

In [113]:
data = BeautifulSoup(html_page.content).find('script', text = re.compile('window.taRollupsAreAsync'))
data

''

In [14]:
# Preview the structure
print(soup.prettify())

<!DOCTYPE html>
<html lang="en" xmlns:og="http://opengraphprotocol.org/schema/">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <link href="https://static.tacdn.com/favicon.ico?v2" id="favicon" rel="icon" type="image/x-icon"/>
  <link color="#000000" href="https://static.tacdn.com/img2/brand_refresh/application_icons/mask-icon.svg" rel="mask-icon" sizes="any"/>
  <meta content="#34e0a1" name="theme-color"/>
  <meta content="telephone=no" name="format-detection"/>
  <script type="text/javascript">
   window.taRollupsAreAsync = true;
  </script>
  <link crossorigin="" href="https://static.tacdn.com/css2/webfonts/TripSans/TripSans.css?v1.002" rel="stylesheet"/>
  <title>
   THE SIREN HOTEL $122 ($̶1̶8̶1̶) - Prices &amp;  Reviews - Detroit, MI - Tripadvisor
  </title>
  <meta content="TripAdvisor" property="al:ios:app_name"/>
  <meta content="284876795" property="al:ios:app_store_id"/>
  <meta content="284876795" name="twitter:app:id:ipad" property="twitte

In [50]:
# all reviews in this?
# <div class="" data-test-target="reviews-tab">
main_cont = soup.findAll('div', class_="in-ssr-only")

# Check
for i in main_cont:
    print(i.prettify())

<div class="in-ssr-only" data-ssr-done="true">
 <div>
  <div class="_3koVEFzz" data-section-signature="about" data-tab="TABS_ABOUT" id="ABOUT_TAB">
   <div class="YjbtrQBV _31WhljXe">
    <h2 class="_11J3kRI9">
     About
    </h2>
   </div>
   <div class="ui_columns _318JyS8B">
    <div class="ui_column">
     <div class="kVNDLtqL">
      <span class="_3cjYfwwQ">
       4.0
      </span>
      <a class="_1dCQBg5N" href="#REVIEWS">
       <div class="_2-OvcgvB">
        Very good
       </div>
       <span class="ui_bubble_rating bubble_40">
       </span>
       <span class="_3jEYFo-z">
        610 reviews
       </span>
      </a>
     </div>
     <span class="_28eYYeHH">
      #6 of 38 hotels in Detroit
     </span>
     <div class="_1krg1t5y">
      <span class="ui_bubble_rating bubble_50">
      </span>
      <div class="_1h7NKZWM">
       Location
      </div>
     </div>
     <div class="_1krg1t5y">
      <span class="ui_bubble_rating bubble_45">
      </span>
      <div class="

In [57]:
main_cont[0].attrs

{'class': ['in-ssr-only'], 'data-ssr-done': 'true'}

In [34]:
# Review containers 
# <div class=“_2wrUUKlw _3hFEdNs8” data-test-target=“HR_CC_Card>
review_cont = soup.findAll('div', class_="_2wrUUKlw _3hFEdNs8")

# Check (5 per page)
len(review_cont)

5

In [19]:
# looking for 'ui_bubble_rating'
# <span class="ui_bubble_rating bubble_30"></span>

# rating = bubble_rating
re_rating = re.compile("ui_bubble_rating (.*)")

In [60]:
# Select a container - then make sub-selections within it to find the relevant information
# <div class="" data-test-target="reviews-tab">
ratings = soup.findAll('span', class_=re_rating)

# Preview
ratings

AttributeError: ResultSet object has no attribute 'findAll'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?

In [58]:
len(ratings), ratings[0].attrs

(22, {'class': ['ui_bubble_rating', 'bubble_40']})

In [84]:
ratings[0].attrs['class'][1]

'bubble_40'

In [97]:
for r in ratings:
    print(int(r.attrs['class'][1][-2:])/10)

4.0
4.0
5.0
4.5
4.5
4.0
4.5
4.5
4.5
4.5
5.0
5.0
5.0
4.5
5.0
5.0
5.0
5.0
5.0
5.0
5.0
5.0


In [52]:
# Review text
# <q class=“IRsGHoPm”>
reviews = soup.findAll('q', class_="IRsGHoPm")

reviews[0].text

# how to auto expand??????????????????????

"I have stayed at the siren multiple times - I adore it. Its gorgeous on every level with no detail overlooked. It's not basic - it's curated. When the outlets are open they are incredible-- I love Karl's for late night French fries, the candy bar for the coolest bar in town, lobby coffee shop is excellent. Highly recommend The Siren for anyone that appreciates style, substance and service. "

In [59]:
reviews[0].attrs

{'class': ['IRsGHoPm']}

In [27]:
len(reviews) == len(ratings)

False

In [66]:
test = soup.findAll('div', class_="oETBfkHU")
len(test)

5

In [95]:
test[0].find('span', class_=re_rating)

<span class="ui_bubble_rating bubble_50"></span>

In [80]:
test[0].find('q', class_="IRsGHoPm").text

"I have stayed at the siren multiple times - I adore it. Its gorgeous on every level with no detail overlooked. It's not basic - it's curated. When the outlets are open they are incredible-- I love Karl's for late night French fries, the candy bar for the coolest bar in town, lobby coffee shop is excellent. Highly recommend The Siren for anyone that appreciates style, substance and service. "

In [104]:
# Main single, Review container (all info & buttons) - "_2wrUUKlw _3hFEdNs8"
# single review container (just bubbles and review)- "oETBfkHU"
test = soup.findAll('div', class_="_2wrUUKlw _3hFEdNs8")

# <span class="ui_bubble_rating bubble_30">
# looking for "ui_bubble_rating", following any value
re_rating = re.compile("ui_bubble_rating (.*)")

ratings_lst = []
reviews_lst = []

for t in test:
    rating_raw = t.find('span', class_=re_rating)
    rating_int = int(rating_raw.attrs['class'][1][-2:-1])
    ratings_lst.append(rating_int)
    review = t.find('q', class_="IRsGHoPm").text
    reviews_lst.append(review)

print(type(ratings_lst[0]), type(reviews_lst[0])) 

ratings_lst, reviews_lst 

<class 'int'> <class 'str'>


([5, 5, 5, 5, 5],
 ["I have stayed at the siren multiple times - I adore it. Its gorgeous on every level with no detail overlooked. It's not basic - it's curated. When the outlets are open they are incredible-- I love Karl's for late night French fries, the candy bar for the coolest bar in town, lobby coffee shop is excellent. Highly recommend The Siren for anyone that appreciates style, substance and service. ",
  'Had a great time at The Siren Hotel. The lobby is one of the most AMAZING lobbies I have ever seen, looked like it was out of a scene in The Great Gatsby. The hotel bar, The Candy Bar, is very unique place with exceptional decor, design and ambience. Great for a couple’s night out. The rooms are clean and newly renovated and furnished while maintaining the historic charm of the building. Can’t beat the location, walking distance to all of Detroit’s downtown cafes, bars and restaurants. Overall a great place for a couple to spend a weekend',
  'The Siren hotel is located in 

In [106]:
# Make a 'get' request to retrieve the page
html_page2 = requests.get("https://www.tripadvisor.com/Hotel_Review-g42139-d13417189-Reviews-or10-The_Siren_Hotel-Detroit_Michigan.html")
# Pass the page contents to beautiful soup for parsing
soup2 = BeautifulSoup(html_page2.content, 'html.parser') 


test2 = soup2.findAll('div', class_="_2wrUUKlw _3hFEdNs8")

# <span class="ui_bubble_rating bubble_30">
# looking for "ui_bubble_rating", following any value
re_rating = re.compile("ui_bubble_rating (.*)")

ratings_lst2 = []
reviews_lst2 = []

for t2 in test2:
    rating_raw2 = t2.find('span', class_=re_rating)
    rating_int2 = int(rating_raw2.attrs['class'][1][-2:-1])
    ratings_lst2.append(rating_int2)
    review2 = t2.find('q', class_="IRsGHoPm").text
    reviews_lst2.append(review2)

print(type(ratings_lst2[0]), type(reviews_lst2[0])) 

ratings_lst2, reviews_lst2 

<class 'int'> <class 'str'>


([4, 5, 3, 4, 5],
 ['Had a wonderful stay! The staff was attentive and welcoming. It was a very chilly day and wish our room was a bit warmer, but the blankets were cozy and the shower hot! We ate at Karls and enjoyed a great sandwich and beer. We will be back!',
  'The warmth of the people and the atmosphere was noticeable as soon as we arrived. The lobby and the bar and the restaurant are all so cozy, cool, and comfortable and everyone working there was easy going and super friendly. The sheets, blanket, pillows and bed were each soft and felt amazing. The giant and gorgeous bathroom was a pleasure and even though the rooms were small, they felt great. The only real problem was that the t.v. there is terrible but there is so much to do around there anyway, not a huge problem. And the location of the Siren is perfect. Right next door to a people mover stop, a short walk from The Guardian, the river, the Michigan building and so many great places to eat and drink as well as so much bea

In [114]:


print(type(ratings_lst2[0]), type(reviews_lst2[0])) 

ratings_lst2, reviews_lst2 

<class 'int'> <class 'str'>


([5, 4, 5, 5, 5],
 ['The Crawford Hotel was seriously the most unique and beautiful hotels I have ever stayed in. My husband and I decided to stay here for our honeymoon and it certainly did not disappoint. From guest services answering our every need and the Tesla valet courtesy car service especially with Jamel and Riley as our drivers was amazing!!!! I can’t really say anything bad about this hotel whatsoever. Super quiet, super cozy, and outstandingly beautiful and romantic!',
  'Husband and I booked the B&B package for our 31st anniversary.  The hotel staff were warm, friendly, and professional.  Also I was able to update my reservation after a date mixup.  As mentioned by others in several reviews the hotel amenities are great and the rooms are beautifully decorated.  The experience overall was great with the exception of a few things:   1.  Being awaken early AM to loud noises which sounded like large doors being closed and metal clanging 2.  Couldn’t figure out how to order bre

In [None]:
# five reviews per page, editing '-or5-'

<a class="ui_button nav next primary " href="/Hotel_Review-g33388-d6577001-Reviews-or5-The_Crawford_Hotel-Denver_Colorado.html">Next</a>
<a class="ui_button nav next primary " href="/Hotel_Review-g33388-d6577001-Reviews-or10-The_Crawford_Hotel-Denver_Colorado.html">Next</a>

In [127]:
# Make a 'get' request to retrieve the page
html_page_last = requests.get("https://www.tripadvisor.com/Hotel_Review-g33388-d6577001-Reviews-or1360-The_Crawford_Hotel-Denver_Colorado.html")
# Pass the page contents to beautiful soup for parsing
soup_last = BeautifulSoup(html_page_last.content, 'html.parser') 

# get next page link!
next_re = re.compile("ui_button nav next (.*)")
next_page_link = soup_last.find('a', class_=next_re)
 

In [129]:
next_page_link = soup_last.find('a', class_=re.compile("ui_button nav next (.*)"))
type(next_page_link)

NoneType

In [None]:
next_page_link['href']

In [None]:
# next page buttons
<div class="ui_pagination is-centered">
<a class="ui_button nav previous secondary " href="/Hotel_Review-g33388-d6577001-Reviews-or10-The_Crawford_Hotel-Denver_Colorado.html">Previous</a>
<a class="ui_button nav next primary " href="/Hotel_Review-g33388-d6577001-Reviews-or20-The_Crawford_Hotel-Denver_Colorado.html">Next</a>
<div class="pageNumbers">
<a class="pageNum " href="/Hotel_Review-g33388-d6577001-Reviews-The_Crawford_Hotel-Denver_Colorado.html">1</a>
<a class="pageNum " href="/Hotel_Review-g33388-d6577001-Reviews-or5-The_Crawford_Hotel-Denver_Colorado.html">2</a>
<a class="pageNum " href="/Hotel_Review-g33388-d6577001-Reviews-or10-The_Crawford_Hotel-Denver_Colorado.html">3</a>
<span class="pageNum current disabled">4</span><a class="pageNum " href="/Hotel_Review-g33388-d6577001-Reviews-or20-The_Crawford_Hotel-Denver_Colorado.html">5</a>
<a class="pageNum " href="/Hotel_Review-g33388-d6577001-Reviews-or25-The_Crawford_Hotel-Denver_Colorado.html">6</a>
<span class="separator">…</span>
<a class="pageNum " href="/Hotel_Review-g33388-d6577001-Reviews-or1360-The_Crawford_Hotel-Denver_Colorado.html">273</a></div></div>

In [184]:
page_one = '/Hotel_Review-g42139-d13417189-Reviews-or595-The_Siren_Hotel-Detroit_Michigan.html#REVIEWS'
root = "https://www.tripadvisor.com"
page_120 = requests.get("https://www.tripadvisor.com" + page_one)
soup_120 = BeautifulSoup(page_120.content, 'html.parser')


def get_next_page_ext(soup):
    next_page_link = soup.find('a', class_=re.compile("ui_button nav next (.*)"))
    if next_page_link:
        return next_page_link['href']
    else:
        return None 
    
    
next_ext = get_next_page_ext(soup_120)

In [185]:
next_ext

'/Hotel_Review-g42139-d13417189-Reviews-or600-The_Siren_Hotel-Detroit_Michigan.html'

In [204]:
from time import sleep

# returns results from all pages using "next" button     
def parse_url(start_url, reviews=[], ratings=[]):
    root = "https://www.tripadvisor.com"
    try:
        html = requests.get(root + start_url)
    except:
        print("")
    soup = BeautifulSoup(html.content, 'html.parser')
    page_reviews, page_ratings = retrieve_reviews_ratings(soup)
    reviews += page_reviews
    ratings += page_ratings
    next_ext = get_next_page_ext(soup)
    
    if next_ext:  # if next_ext exists, i.e. not at end of pages yet
        i = 1
        print("Page:", i, "ext:", next_ext)
        next_url = root + next_ext
        i += 1
        return parse_url(next_url, reviews, ratings)
        # recursive, calls on self to continue with function to parse on NEXT page
    else: # at end of pages, stop looping, return final lists
        return reviews, ratings
    

In [205]:
page_one = '/Hotel_Review-g42139-d13417189-Reviews-or595-The_Siren_Hotel-Detroit_Michigan.html#REVIEWS'
reviews, ratings = parse_url(page_one, reviews, ratings)

Page: 1 ext: /Hotel_Review-g42139-d13417189-Reviews-or600-The_Siren_Hotel-Detroit_Michigan.html


ConnectionError: HTTPSConnectionPool(host='www.tripadvisor.comhttps', port=443): Max retries exceeded with url: //www.tripadvisor.com/Hotel_Review-g42139-d13417189-Reviews-or600-The_Siren_Hotel-Detroit_Michigan.html (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x1284fe1f0>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [None]:
reviews, ratings

In [189]:
def retrieve_reviews_ratings(soup):
    # set container
    container = soup.findAll('div', class_="_2wrUUKlw _3hFEdNs8")

    # blank lists
    page_reviews=[]
    page_ratings=[]

    # looking for "ui_bubble_rating", following any value
    rating_re = re.compile("ui_bubble_rating (.*)")

    for item in container:
    
        # save all ratings on page as int
        rating_raw = item.find('span', class_=re_rating)
        rating_int = int(rating_raw.attrs['class'][1][-2:-1])
        page_ratings.append(rating_int)
        
        # save all reviews on page as str 
        review = item.find('q', class_="IRsGHoPm").text
        page_reviews.append(review)
        
    return page_reviews, page_ratings

In [190]:
page_reviews, page_ratings = retrieve_reviews_ratings(soup_120)

In [191]:
page_reviews

['It\'s a newly-renovated historical building turned into a hotel in the downtown. Good location. Decorations give a special vibe. I stayed at one of the rooms listed as "The Chamber". The size of the room was on the smaller side but good enough for a solo traveler. The decor of the room was really inspiring. I didn\'t get a chance to check out the candy bar but the populace coffee shop at the lobby was a delight. And to top it off, great hospitality from the polite and helpful staff made my short stay quite pleasant.',
 'I stayed at The Siren for one night during their soft opening period - they had opened 4 weeks before. I must admit I came from a long day at work and a networking event with a LOT of booze - not sure how that influenced my perception :-) Check in staff was very friendly and even pretended not to notice my state of inebriation. The room was a decent size and nicely decorated - the bathroom is stylish with a big shower. TV did not connect to the internet - I think... o

In [188]:
page_ratings

[4, 4, 3, 5, 5]

In [192]:
page_ratings + page_reviews

[4,
 4,
 3,
 5,
 5,
 'It\'s a newly-renovated historical building turned into a hotel in the downtown. Good location. Decorations give a special vibe. I stayed at one of the rooms listed as "The Chamber". The size of the room was on the smaller side but good enough for a solo traveler. The decor of the room was really inspiring. I didn\'t get a chance to check out the candy bar but the populace coffee shop at the lobby was a delight. And to top it off, great hospitality from the polite and helpful staff made my short stay quite pleasant.',
 'I stayed at The Siren for one night during their soft opening period - they had opened 4 weeks before. I must admit I came from a long day at work and a networking event with a LOT of booze - not sure how that influenced my perception :-) Check in staff was very friendly and even pretended not to notice my state of inebriation. The room was a decent size and nicely decorated - the bathroom is stylish with a big shower. TV did not connect to the int

In [115]:
import pandas as pd
df = pd.DataFrame([ratings_lst2, reviews_lst2, location2]).transpose()
df.columns = ['Rating', 'Review', 'Location']
df

Unnamed: 0,Rating,Review
0,5,The Crawford Hotel was seriously the most uniq...
1,4,Husband and I booked the B&B package for our 3...
2,5,"We traveled from Emeryville, CA to Denver on t..."
3,5,We stayed at The Crawford the weekend before T...
4,5,Thoroughly enjoyed my stay at The Crawford! We...


In [None]:
# Define a master container with all of the items of interest

In [None]:
# Then, make a selection, preview it, and continue slicing down until you have what you're after

In [None]:
# repeatable!!!!!! (ex from lab)
final_titles = [h3.find('a').attrs['title'] for h3 in book_container.findAll('h3')]
print(len(final_titles), final_titles[:5])

In [None]:
# list all ratings per page
star_ratings = []
for p in book_container.findAll('p', {"class" : regex}):
    star_ratings.append(p.attrs['class'][-1])
star_ratings

In [None]:
# replace with Int
star_dict = {'One': 1, 'Two': 2, 'Three':3, 'Four': 4, 'Five':5} # Manually create a dictionary to translate to numeric
star_ratings = [star_dict[s] for s in star_ratings]
star_ratings

In [None]:


# save

In [None]:
# paginate to rest of review pages



# concat all properties df






# EXTERNAL

In [None]:
# HIDDEN!!!!!!!
# Import Statements




Grab the attraction titles for one city¶
In [3]:
html_london = requests.get('https://www.tripadvisor.com/Attraction_Products-g186338-zfg12131-London_England.html')
soup = BeautifulSoup(html_london.content, 'html.parser')
In [4]:
soup.prettify


container = soup.find('div', id="ATTRACTION_PRODUCTS_LIST")

attractions = container.find_all('h2')

# find data location looking for within html
attractions[0].find('a').attrs['title']

# loop through entire page
final_attractions = []
for attraction in attractions:
    att = attraction.find('a')
    if hasattr(att, 'title'):
        final_attractions.append(att.attrs['title'])

# confirm correct
final_attractions[:15]

# build function to repeat
 def get_attractions(html_path):
    """
    input html path from TripAdvisor as a string
    returns list of attraction names from that page
    """
    html = requests.get(html_path)
    soup = BeautifulSoup(html.content, 'html.parser')
    container = soup.find('div', id="ATTRACTION_PRODUCTS_LIST")
    attractions = container.findAll('h2')
    final_attractions = []
    for attraction in attractions:
        att = attraction.find('a')
        if hasattr(att, 'title'):
            final_attractions.append(att.attrs['title'])
    return final_attractions



    # Get all attractions from all 94 pages
# Figure out the pattern
page_1 = 'http://tripadvisor.com/Attraction_Products-g186338-a_sort.' + '-London_England.html#ATTRACTION_LIST'
page_2 = 'https://www.tripadvisor.com/Attraction_Products-g186338-a_sort.-oa30-London_England.html#ATTRACTION_LIST'
page_3 = 'https://www.tripadvisor.com/Attraction_Products-g186338-a_sort.-oa60-London_England.html#ATTRACTION_LIST'

# Test pattern
base = 'http://tripadvisor.com/Attraction_Products-g186338-a_sort.'
page = '-oa' + str(30)
end = '-London_England.html#ATTRACTION_LIST'

page_2 = base + page + end
page_2



# get ALL 
def get_all_attractions(html_list):
    """
    input list of strings of html paths from TripAdvisor
    returns list of attraction names from all pages
    """
    final_attractions = []
    for html in html_list:
        html_ = requests.get(html)
        soup = BeautifulSoup(html_.content, 'html.parser')
        container = soup.find('div', id="ATTRACTION_PRODUCTS_LIST")
        attractions = container.findAll('h2')
        new_attractions = []
        for attraction in attractions:
            att = attraction.find('a')
            if hasattr(att, 'title'):
                new_attractions.append(att.attrs['title'])
        final_attractions.extend(new_attractions)
    return final_attractions


# Warning! Takes a few mins to run
all_london_attractions = get_all_attractions(london_htmls)