# Web Scraping Museum Data from TripAdvisor
Author: Anne Chen  
2016

#### Import Modules

In [527]:
# import modules
from bs4 import BeautifulSoup
import requests
import re
import numpy as np
import pandas as pd
import json
import googlemaps
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from geopy.geocoders import Nominatim

# run the command below in terminal to install missing module
# ./anaconda2/bin/pip install module_name

#### Define Getter Functions

In [939]:
def create_url_lst(url_head, start=0, stop=991, step=30):
    '''generate a list of urls for scraping

    url_head: search url that ends right before the result offset value
    start, stop, step: range of result offsets appended to url_head; the
        defaults give the offsets 0, 30, ..., 990 (34 search pages)
    returns a list of url strings, one per search page
    '''
    # build a real list instead of a lazy map object, so the result has a
    # length and can be iterated more than once on Python 3 as well
    return [url_head + str(offset) for offset in range(start, stop, step)]

In [947]:
def create_master_soup(url_lst, driver_path='/Users/annecool37/Documents/chromedriver'):
    '''create a list of beautiful soup object for each page

    url_lst: iterable of search page urls
    driver_path: location of the chromedriver executable
    returns a list with one BeautifulSoup object (lxml parser) per url
    '''
    search_soup_lst = []
    # create driver for website
    driver = webdriver.Chrome(driver_path)
    try:
        for url in url_lst:
            driver.get(url)
            search_soup_lst.append(BeautifulSoup(driver.page_source, 'lxml'))
    finally:
        # always shut the browser down, even when a page load fails,
        # so no Chrome process is left running
        driver.quit()
    return search_soup_lst

In [949]:
def get_museum_soup(search_soup):
    '''create soup object for all museums listed on one search page

    search_soup: soup of a search result page
    returns a list with one BeautifulSoup object (lxml parser) per museum
    '''
    import time  # local import: `time` is not imported at the top of the file

    museum_result = search_soup.find_all('div', {'class': 'result ATTRACTIONS'})
    target_lst = [link.find('div', {'class': 'title'})['onclick'] for link in museum_result]
    head = "https://www.tripadvisor.com"
    # the last comma separated piece of the onclick value, minus two
    # characters on each side, is appended to the site root
    # (presumably a quoted relative url -- verify against the live page)
    url_lst = [head + target.split(",")[-1][2:-2] for target in target_lst]
    museum_soup_lst = []
    if not url_lst:
        # nothing to fetch, so do not start a browser at all
        return museum_soup_lst
    driver = webdriver.Chrome('/Users/annecool37/Documents/chromedriver')
    try:
        for url in url_lst:
            driver.get(url)
            time.sleep(1)
            museum_soup_lst.append(BeautifulSoup(driver.page_source, 'lxml'))
    finally:
        # this function runs once per search page; without quit() every
        # call would leave another Chrome process behind
        driver.quit()
    return museum_soup_lst

In [882]:
def get_museum_name(museum_soup):
    '''return the text of the h1 with id HEADING for each museum page'''
    name_lst = []
    for page in museum_soup:
        heading = page.find('h1', {'id': 'HEADING'})
        # only newlines are trimmed from both ends
        name_lst.append(heading.getText().strip('\n'))
    return name_lst
def get_review_count(museum_soup):
    '''return the first word of the #REVIEWS link text for each museum page'''
    count_lst = []
    for page in museum_soup:
        review_link = page.find('a', {'href': '#REVIEWS'})
        words = review_link.getText().split()
        count_lst.append(words[0])
    return count_lst
def get_address(museum_soup):
    '''return the address text of each museum page without its 9 character prefix'''
    address_lst = []
    for page in museum_soup:
        raw_text = page.find('span', {'property': 'address'}).getText()
        # trim newlines on both ends, then trailing whitespace
        trimmed = raw_text.strip('\n').rstrip()
        # drop the first 9 characters (presumably the 'Address: ' label)
        address_lst.append(trimmed[9:])
    return address_lst

In [685]:
def get_rating(museum_soup):
    '''return the first 3 characters of the rating image alt text for each museum'''
    rating_lst = []
    for page in museum_soup:
        rating_span = page.find('span', {'class': 'rate sprite-rating_rr rating_rr'})
        alt_text = rating_span.find('img').get('alt')
        rating_lst.append(alt_text[:3])
    return rating_lst

In [703]:
def get_rank_total(museum_soup):
    '''return (rank_lst, total_things_to_do_lst) read from the slim_ranking div

    the ranking text is split on whitespace: the rank is the first word
    without its leading character, the total is the third word
    '''
    rank_lst = []
    total_things_to_do_lst = []
    for page in museum_soup:
        words = page.find('div', {'class': 'slim_ranking'}).getText().split()
        rank_lst.append(words[0][1:])
        total_things_to_do_lst.append(words[2])
    return rank_lst, total_things_to_do_lst

In [716]:
def get_heading_details(museum_soup):
    '''extract the heading_details div of each museum page

    the result is the input of get_category and if_featured_count;
    an entry is whatever soup.find returns for that page
    '''
    # the attribute filter must be a dict: the former set literal
    # {'class', 'heading_details'} also matched any div with class "class"
    return [soup.find('div', {'class': 'heading_details'}) for soup in museum_soup]
    
def get_category(categories):
    '''return, for each heading tag, the text of its links whose href matches "Attractions"'''
    href_pattern = re.compile("Attractions")
    categories_nested_lst = []
    for heading in categories:
        links = heading.find_all('a', {'href': href_pattern})
        categories_nested_lst.append([link.getText() for link in links])
    return categories_nested_lst

def if_featured_count(categories):
    '''check the number of guide where the museum is being featured

    categories: heading tags as returned by get_heading_details
    returns one int per heading: 0 when no "As_Featured_In_Guide" link
    can be read, 1 for a single link, otherwise 1 plus the leading number
    of the second link's text
    '''
    guide_pattern = re.compile("As_Featured_In_Guide")
    featured_in_guide_count_lst = []
    for item in categories:
        try:
            features = item.find_all('a', {'onclick': guide_pattern})
            if len(features) == 1:
                count = 1
            else:
                count = 1 + int(features[1].getText().split()[0])
        except (AttributeError, IndexError, ValueError):
            # missing heading, no link at all, or a second link that does
            # not start with a number: treat as not featured. Anything
            # else (e.g. Ctrl-C) is no longer swallowed.
            count = 0
        featured_in_guide_count_lst.append(count)
    return featured_in_guide_count_lst

In [920]:
def get_phone_num(museum_soup):
    '''record the phone number of the museum

    returns one entry per museum page: the text after the first ": " in
    the phoneNumber div, or 'NA' when the div or the separator is missing
    '''
    phone_num_lst = []
    for soup in museum_soup:
        tag = soup.find('div', {'class': 'phoneNumber'})
        try:
            phone_num = tag.getText().split(": ")[1]
        except (AttributeError, IndexError):
            # AttributeError: no phoneNumber div (tag is None)
            # IndexError: text without a ": " separator
            phone_num = 'NA'
        phone_num_lst.append(phone_num)
    return phone_num_lst

In [723]:
def check_fee(museum_soup):
    '''check if the musuem requires fee

    returns one entry per museum page: the text after the first ": " in
    the second "detail" div of the first "detail_section details" div,
    or 'NA' when any of those pieces is missing
    '''
    fee_lst = []
    for soup in museum_soup:
        sections = soup.find_all('div', {'class': 'detail_section details'})
        try:
            fee_tag = sections[0].find_all('div', {'class': 'detail'})[1]
            fee = fee_tag.getText().split(": ")[1]
        except (AttributeError, IndexError):
            # no details section, fewer than two detail divs, or no ": "
            fee = 'NA'
        fee_lst.append(fee)
    return fee_lst

In [730]:
def get_description(museum_soup):
    '''get the description of the museum

    returns one entry per museum page: the text of the second <p> inside
    the last "detail_section details" div, or 'NA' when it is missing
    '''
    description_lst = []
    for soup in museum_soup:
        sections = soup.find_all('div', {'class': 'detail_section details'})
        try:
            description = sections[-1].find_all('p')[1].getText()
        except (AttributeError, IndexError):
            # no details section, or fewer than two paragraphs in it
            description = 'NA'
        description_lst.append(description)
    return description_lst

In [731]:
def get_length_of_visit(museum_soup):
    '''get the recommended length of a visit for a museum

    for every "detail_section details" div of a page the first "detail"
    div is read; the result for the page is the text after ": " of the
    first such div that has that separator, or 'NA' when there is none
    '''
    length_of_visit_lst = []
    for soup in museum_soup:
        length_of_visit = 'NA'
        for section in soup.find_all('div', {'class': 'detail_section details'}):
            detail = section.find('div', {'class': 'detail'})
            if detail is None:
                continue
            pieces = detail.getText().split(": ")
            # a detail div without ": " used to raise IndexError and stop
            # the whole run; skip it like the other getters fall back to 'NA'
            if len(pieces) > 1:
                length_of_visit = pieces[1]
                break
        length_of_visit_lst.append(length_of_visit)
    return length_of_visit_lst

In [740]:
def get_review_quotes(museum_soup):
    '''return, per museum page, the text of every span with class noQuotes'''
    quote_lst = []
    for page in museum_soup:
        spans = page.find_all('span', {'class': 'noQuotes'})
        quote_lst.append([span.getText() for span in spans])
    return quote_lst

In [747]:
def get_partial_review(museum_soup):
    '''return, per museum page, the text of every p with class partial_entry'''
    review_lst = []
    for page in museum_soup:
        entries = page.find_all('p', {'class': 'partial_entry'})
        review_lst.append([entry.getText() for entry in entries])
    return review_lst

In [748]:
def get_review_tag_cloud(museum_soup):
    '''return, per museum page, the text of every span with class "ui_tagcloud fl"'''
    tagcloud_lst = []
    for page in museum_soup:
        clouds = page.find_all('span', {'class': 'ui_tagcloud fl'})
        tagcloud_lst.append([cloud.getText() for cloud in clouds])
    return tagcloud_lst

In [767]:
def get_rating_details(museum_soup):
    '''return, per museum page, one vote string per label of the ratingFilter div

    each string is the text of the label's first span without its first
    character and its last two characters
    '''
    votes_per_museum = []
    for page in museum_soup:
        rating_filter = page.find('div', {'id': 'ratingFilter'})
        votes = []
        for label in rating_filter.find_all('label'):
            # calling the label looks up its span children
            first_span = label('span')[0]
            votes.append(first_span.getText()[1:-2])
        votes_per_museum.append(votes)
    return votes_per_museum
# Excellent, Very good, Average, Poor, Terrible

In [768]:
def get_traveler_type(museum_soup):
    '''return, per museum page, one count string per label of the traveler type div

    each string is the text of the label's first span without its first
    and last character
    '''
    counts_per_museum = []
    for page in museum_soup:
        segment = page.find('div', {'class': 'col segment extraWidth'})
        counts = []
        for label in segment.find_all('label'):
            # calling the label looks up its span children
            first_span = label('span')[0]
            counts.append(first_span.getText()[1:-1])
        counts_per_museum.append(counts)
    return counts_per_museum
# Families, Couples, Solo, Business, Friends

In [1032]:
###########################################
# get latitude and longitude from address #
###########################################
# documentation
# https://github.com/googlemaps/google-maps-services-python/blob/master/README.md
# enable the API on console
# https://console.developers.google.com/apis/dashboard?project=my-trip-142904&duration=PT1H
# get access to API
def get_lat_lng(add_lst, api_key=None):
    '''get latitude and longitude of each address with the Google Maps client

    add_lst: list of address strings
    api_key: Google Maps API key; when omitted the GOOGLE_MAPS_API_KEY
        environment variable is used, then the legacy key below
    returns (lat_lst, lng_lst), both parallel to add_lst; 'NA' marks an
        address whose geocode result has no readable location
    '''
    import os  # local import: `os` is not imported at the top of the file

    lat_lst = []
    lng_lst = []
    if not add_lst:
        # nothing to geocode, so do not build a client at all
        return lat_lst, lng_lst
    if api_key is None:
        # NOTE(security): the fallback literal is a key committed to the
        # source; it is kept only so existing callers keep working and
        # should be revoked and supplied through the environment instead
        api_key = (os.environ.get('GOOGLE_MAPS_API_KEY')
                   or 'AIzaSyAp4nKWDK7gL4hMqm-uPy0S49UMcU3Mqr4')
    gmaps = googlemaps.Client(key=api_key)
    for address in add_lst:
        geocode_result = gmaps.geocode(address)
        try:
            location = geocode_result[0]['geometry']['location']
            lat, lng = location['lat'], location['lng']
        except (IndexError, KeyError, TypeError):
            # empty result list or a result without the expected keys
            lat, lng = 'NA', 'NA'
        # append both values together: appending 'lat' before 'lng' was
        # looked up could leave the two lists with different lengths
        lat_lst.append(lat)
        lng_lst.append(lng)
    return lat_lst, lng_lst

#### Define Utility Functions

In [662]:
def flatten_lst(lst):
    '''return one flat list holding the items of every sublist, in order'''
    flat = []
    for sublist in lst:
        for item in sublist:
            flat.append(item)
    return flat

In [936]:
def write_json(name, which, dic):
    '''serialize dic as json into the file named name + which + ".json"'''
    target_path = ''.join((name, which, '.json'))
    with open(target_path, 'w') as json_file:
        json.dump(dic, json_file)

In [770]:
def unicode_to_ascii(lst):
    '''encode every item as ascii, silently dropping characters that do not fit'''
    # done so that writing the data into csv files later does not raise
    ascii_lst = []
    for text in lst:
        ascii_lst.append(text.encode('ascii', 'ignore'))
    return ascii_lst

In [772]:
def to_dict(key_lst, value_lst):
    '''pair key_lst[i] with value_lst[i] in a dictionary; a repeated key keeps its last value'''
    paired = {}
    for position, key in enumerate(key_lst):
        # indexing (not zip) keeps the IndexError for a too short value_lst
        paired[key] = value_lst[position]
    return paired

#### Define Main Execution Functions

In [1033]:
def get_soup(url_head):
    '''build the search page urls from url_head and return one soup per page'''
    return create_master_soup(create_url_lst(url_head))

def get_data_and_save_stepwise(search_soup_lst, which):
    '''scrape the museums of every search page, saving all results after each page

    search_soup_lst: list of search result soups, one per search page
    which: suffix inserted into every output file name, e.g. '_USonly'

    After each page the data accumulated so far is rewritten to
    'tripadvisor_museum<which>.csv' and to one '<field><which>.json' file
    per nested field, so the pages already scraped survive a failure on a
    later page. Returns None.
    '''
    # initialize the list
    museum_name_lst = []
    review_count_lst = []
    address_lst = []
    lat_lst = []
    lng_lst = []
    rating_lst = []
    rank_lst = []
    total_things_to_do_lst = []
    categories_nested_lst = []
    featured_in_guide_count_lst = []
    phone_num_lst = []
    fee_lst = []
    description_lst = []
    length_of_visit_lst = []
    quote_lst = []
    review_content_lst = []
    tagcloud_lst = []
    traverler_rating_lst = []
    traveler_type_lst = []
    # save result for each page ran to prevent no result scenario caused by timeoutexception
    for idx, search_soup in enumerate(search_soup_lst):
        # parenthesised so the line is valid on Python 2 and Python 3 alike
        print("running page " + str(idx + 1))
        museum_soup = get_museum_soup(search_soup)
        museum_name_lst += get_museum_name(museum_soup)
        review_count_lst += get_review_count(museum_soup)
        add_lst = get_address(museum_soup)
        address_lst += add_lst
        # only the addresses of this page are geocoded
        lat_lst_, lng_lst_ = get_lat_lng(add_lst)
        lat_lst += lat_lst_
        lng_lst += lng_lst_
        rating_lst += get_rating(museum_soup)
        rank_lst_, total_things_to_do_lst_ = get_rank_total(museum_soup)
        rank_lst += rank_lst_
        total_things_to_do_lst += total_things_to_do_lst_
        categories = get_heading_details(museum_soup)
        categories_nested_lst += get_category(categories)
        featured_in_guide_count_lst += if_featured_count(categories)
        phone_num_lst += get_phone_num(museum_soup)
        fee_lst += check_fee(museum_soup)
        description_lst += get_description(museum_soup)
        length_of_visit_lst += get_length_of_visit(museum_soup)
        quote_lst += get_review_quotes(museum_soup)
        review_content_lst += get_partial_review(museum_soup)
        tagcloud_lst += get_review_tag_cloud(museum_soup)
        traverler_rating_lst += get_rating_details(museum_soup)
        traveler_type_lst += get_traveler_type(museum_soup)

        # create dictionary; the free text columns are converted from
        # unicode to ascii. The misspelled 'Langtitude' key is kept on
        # purpose: it is the column name of the csv files already produced.
        museum_dict = {'MuseumName': unicode_to_ascii(museum_name_lst),
                       'ReviewCount': review_count_lst,
                       'Address': unicode_to_ascii(address_lst),
                       'Latitude': lat_lst, 'Langtitude': lng_lst,
                       'Rating': rating_lst, 'Rank': rank_lst,
                       'TotalThingsToDo': total_things_to_do_lst,
                       'FeatureCount': featured_in_guide_count_lst,
                       'PhoneNum': unicode_to_ascii(phone_num_lst),
                       'Fee': fee_lst,
                       'Description': unicode_to_ascii(description_lst),
                       'LengthOfVisit': length_of_visit_lst}

        # convert dictionary to dataframe and save file as .csv
        museum_df = pd.DataFrame(museum_dict)
        museum_df.to_csv('tripadvisor_museum' + which + '.csv')

        # convert nested list to dictionary with museum name as key value
        # and write json file for all nested lists (file name prefixes are
        # unchanged, including their spelling)
        nested_fields = [('museum_categories', categories_nested_lst),
                         ('review_content', review_content_lst),
                         ('tag_clouds', tagcloud_lst),
                         ('traverler_rating', traverler_rating_lst),
                         ('traverler_type', traveler_type_lst),
                         ('review_quote', quote_lst)]
        for file_prefix, nested_lst in nested_fields:
            write_json(file_prefix, which, to_dict(museum_name_lst, nested_lst))
    

#### Main Code 

In [None]:
# USA Museums
# get_soup appends the result offsets to the url head (which ends in '&o=')
# and loads every search page; get_data_and_save_stepwise then visits the
# museums of each page and writes the '*_USonly' csv / json files
us_url_head = 'https://www.tripadvisor.com/Search?geo=191&pid=3826&typeaheadRedirect=true&redirect=&startTime=1473385326897&uiOrigin=MASTHEAD&q=museum&returnTo=__2F__&searchSessionId=EB567A2D74F417B6B37A45E17691A08B1473370920190ssid#&o='
us_soup_lst = get_soup(us_url_head)
get_data_and_save_stepwise(us_soup_lst, '_USonly')

# World Museums 
# same steps with an empty 'geo' value in the url; writes the '*_world' files
world_url_head = 'https://www.tripadvisor.com/Search?geo=&pid=3826&typeaheadRedirect=true&redirect=&startTime=1473447418090&uiOrigin=MASTHEAD&q=museum&returnTo=__2F__&searchSessionId=EB567A2D74F417B6B37A45E17691A08B1473433016534ssid#&o='
world_soup_lst = get_soup(world_url_head)
get_data_and_save_stepwise(world_soup_lst, '_world')

#### Miscellaneous

In [986]:
# global gmaps
# gmaps = googlemaps.Client(key='AIzaSyAp4nKWDK7gL4hMqm-uPy0S49UMcU3Mqr4')

In [943]:
###################################################################################
### These functions work, but they frequently hit a TimeoutException,           ###
### so the stepwise functions above were written and used instead               ###
###################################################################################

# def get_museum_soup_lst(url_lst):
#     '''create the master list storing all museum soups'''
#     # create the soups for each search page
#     search_soup_lst = create_master_soup(url_lst)
#     # create soup for each musem
#     master_museum_soup_lst = [get_museum_soup(search_soup) for search_soup in search_soup_lst]
#     # flatten the nested list of  museum soup
#     flatten_museum_soup_lst = flatten_lst(master_museum_soup_lst)
#     return flatten_museum_soup_lst

# def get_and_save_data(flatten_museum_soup_lst, which):
#     # get data from each museum
#     print "start collecting data"
#     museum_name_lst = get_museum_name(flatten_museum_soup_lst)
#     review_count_lst = get_review_count(flatten_museum_soup_lst)
#     address_lst = get_address(flatten_museum_soup_lst)
#     lat_lst, lng_lst = get_lat_lng(address_lst)
#     rating_lst = get_rating(flatten_museum_soup_lst) 
#     rank_lst, total_things_to_do_lst = get_rank_total(flatten_museum_soup_lst)
#     categories = get_heading_details(flatten_museum_soup_lst)
#     categories_nested_lst = get_category(categories)
#     featured_in_guide_count_lst = if_featured_count(categories)
#     phone_num_lst = get_phone_num(flatten_museum_soup_lst)
#     fee_lst = check_fee(flatten_museum_soup_lst)
#     description_lst = get_description(flatten_museum_soup_lst)
#     length_of_visit_lst = get_length_of_visit(flatten_museum_soup_lst)
#     quote_lst = get_review_quotes(flatten_museum_soup_lst)
#     review_content_lst = get_partial_review(flatten_museum_soup_lst)
#     tagcloud_lst = get_review_tag_cloud(flatten_museum_soup_lst)
#     traverler_rating_lst = get_rating_details(flatten_museum_soup_lst)
#     traveler_type_lst = get_traveler_type(flatten_museum_soup_lst)
#     print "finished collecting data"
    
#     museum_dict = {'MuseumName': museum_name_lst, 'ReviewCount': review_count_lst,
#                    'Address':address_lst, 'Latitude':lat_lst, 'Langtitude':lng_lst, 
#                    'Rating':rating_lst, 'Rank':rank_lst, 'TotalThingsToDo': total_things_to_do_lst, 
#                    'FeatureCount':featured_in_guide_count_lst, 'PhoneNum':phone_num_lst, 'Fee':fee_lst,
#                    'Description':description_lst, 'LengthOfVisit':length_of_visit_lst}

#     museum_dict['MuseumName'] = unicode_to_ascii(museum_dict['MuseumName'])
#     museum_dict['Address'] = unicode_to_ascii(museum_dict['Address'])
#     museum_dict['Description'] = unicode_to_ascii(museum_dict['Description'])
#     museum_dict['PhoneNum'] = unicode_to_ascii(museum_dict['PhoneNum'])

#     # convert the dictionary into dataframe
#     museum_df = pd.DataFrame(museum_dict)
      
#     # save file to .csv
#     print "save result to csv"
#     museum_df.to_csv('tripadvisor_museum'+ which +'.csv')
    
#     print "creating json files for nested lists...."
#     # convert nested list into dictionary with museum name as key value
#     # and write json file for all nested lists
#     category_dict = to_dict(museum_name_lst , categories_nested_lst)
#     write_json('museum_categories', which, category_dict)

#     review_content_dict = to_dict(museum_name_lst , review_content_lst)
#     write_json('review_content', which, review_content_dict)

#     tagcloud_dict = to_dict(museum_name_lst , tagcloud_lst)
#     write_json('tag_clouds', which, tagcloud_dict)

#     traverler_rating_dict = to_dict(museum_name_lst , traverler_rating_lst)
#     write_json('traverler_rating', which, traverler_rating_dict)

#     traverler_type_dict = to_dict(museum_name_lst , traveler_type_lst)
#     write_json('traverler_type', which, traverler_type_dict)

#     quote_dict = to_dict(museum_name_lst , quote_lst)
#     write_json('review_quote', which, quote_dict)
    
#     print "done!"

In [1016]:
#########################
#########################
# Execution codes for   #
# non-stepwise function #
#########################
#########################
# us_url_head = 'https://www.tripadvisor.com/Search?geo=191&pid=3826&typeaheadRedirect=true&redirect=&startTime=1473385326897&uiOrigin=MASTHEAD&q=museum&returnTo=__2F__&searchSessionId=EB567A2D74F417B6B37A45E17691A08B1473370920190ssid#&o='
# world_url_head = 'https://www.tripadvisor.com/Search?geo=&pid=3826&typeaheadRedirect=true&redirect=&startTime=1473447418090&uiOrigin=MASTHEAD&q=museum&returnTo=__2F__&searchSessionId=EB567A2D74F417B6B37A45E17691A08B1473433016534ssid#&o='
# nyc_url_head = 'https://www.tripadvisor.com/Search?geo=28953&pid=3826&typeaheadRedirect=true&redirect=&startTime=1473458802532&uiOrigin=MASTHEAD&q=museum&returnTo=__2F__&searchSessionId=EB567A2D74F417B6B37A45E17691A08B1473444400354ssid#&o='

     
# ### USA ###
# # create url list for US muesums
# url_lst = create_url_lst(us_url_head)
# # get the master list containing all USA museum soups
# flatten_museum_soup_lst = get_museum_soup_lst(url_lst)
# # save data
# get_and_save_data(flatten_museum_soup_lst, '_USonly')

# ### WORLDWIDE ###
# # create url list for worldwide museums (but only 34 pages are available)
# world_url_lst = create_url_lst(world_url_head)
# # get the master list containing all worldwide museum soups
# world_museum_soup_lst = get_museum_soup_lst(world_url_lst)
# # save data
# get_and_save_data(world_museum_soup_lst,'_world')
# # ### TimeoutException ###

### NYC ###
# nyc_url_lst = create_url_lst(nyc_url_head)
# nyc_museum_soup_lst = get_museum_soup_lst(nyc_url_lst)
# get_and_save_data(nyc_museum_soup_lst,'_nyc')

In [681]:
# ######################################
# # get the primary type of the museum #
# ######################################
# def get_primary_type(search_soup):
#     '''get the primary type of the museum'''
#     prime_type_result = search_soup.find_all('div',{'class':'type'})
#     prime_type_lst = [item.find('span').getText() for item in prime_type_result]
#     return prime_type_lst

In [684]:
# ################################
# # get image html of the museum #
# ################################
# def get_img_html(search_soup):
#     '''get the museum image'''
#     class_tags = search_soup.find_all('div', {'class':'sizedThumb'})
#     img_html_lst = [tag.find('img')['src'] for tag in class_tags]
#     return img_html_lst