# Master Data Science for Business - Data Science Consulting - Session 2 

# Notebook 3: 

# Web Scraping with Scrapy: Getting reviews from TripAdvisor

## 1. Importing packages

In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
import sys
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
import json
import logging
import pandas as pd

## 2. Some class and functions

In [2]:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

class HotelreviewsItem(scrapy.Item):
    # define the fields for your item here like:
    rating = scrapy.Field()
    review = scrapy.Field()
    title = scrapy.Field()
    trip_date = scrapy.Field()
    trip_type = scrapy.Field()
    published_date = scrapy.Field()
    image_url = scrapy.Field()
    hotel_type = scrapy.Field()
    hotel_name = scrapy.Field()
    hotel_adress = scrapy.Field()
    price_range = scrapy.Field()
    reviewer_id = scrapy.Field()
    review_id = scrapy.Field()
    review_language = scrapy.Field()
    pid = scrapy.Field()
    locid = scrapy.Field()

In [3]:
def user_info_splitter(raw_user_info):
    """

    :param raw_user_info:
    :return:
    """

    user_info = {}

    splited_info = raw_user_info.split()
    for element in splited_info:
        converted_element = get_convertible_elements_as_dic(element)
        if converted_element:
            user_info[converted_element[0]] = converted_element[1]

    return user_info

## 2. Creating the JSon pipeline 

In [4]:
#JSon pipeline, you can rename the "trust.jl" to the name of your choice
class JsonWriterPipeline(object):

    def open_spider(self, spider):
        self.file = open('tripadvisorenfinal1.jl', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

## 3. Spider

Now you know how to get data from one page, we want to automate the spider so it will crawl through all pages of reviews, ending with a full spider able to scrape every reviews of the selected parc. You will modify here the parse function since this is where you tell the spider to get the links and to follow them. <br>
<b>To Do</b>: Complete the following code, to scrape all the reviews of one parc. 

In [5]:
class MySpider(CrawlSpider):
    name = 'BasicSpider'
    domain_url = "https://www.tripadvisor.com"
    # allowed_domains = ["https://www.tripadvisor.com"]

    start_urls = [
        "https://www.tripadvisor.com/Hotel_Review-g209964-d523986-Reviews-or6725-Center_Parcs_Longleat_Forest-Warminster_Wiltshire_England.html#REVIEWS"]
    
        #Custom settings to modify settings usually found in the settings.py file 
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}, # Used for pipeline 1
        'FEED_FORMAT':'json',                                 # Used for pipeline 2
        'FEED_URI': 'tripadvisorenfinal1.json',                        # Used for pipeline 2
        'DOWNLOAD_DELAY' : 2   
    }

    def parse(self, response):

        
        next_reviews_page_url = "https://www.tripadvisor.com" + response.xpath(
            "//a[contains(@class,'nav') and contains(@class,'next') and contains(@class,'primary')]/@href").extract()[0]
        all_review_pages = response.xpath(
            "//a[contains(@class,'pageNum') and contains(@class,'last')]/@data-offset").extract()

        next_page_number = eval(response.xpath(
            "//a[contains(@class,'nav') and contains(@class,'next') and contains(@class,'primary')]/@data-page-number").extract()[
                                    0])

        if next_page_number:
            yield scrapy.Request(next_reviews_page_url, callback=self.parse)

        review_urls = []
        for partial_review_url in response.xpath("//div[contains(@class,'quote')]/a/@href").extract():
            review_url = response.urljoin(partial_review_url)
            if review_url not in review_urls:
                review_urls.append(review_url)

            yield scrapy.Request(review_url, callback=self.parse_review_page)

    def parse_review_page(self, response):

        item = HotelreviewsItem()

        item["reviewer_id"] = next(iter(response.xpath(
            "//div[contains(@class,'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div/div[contains(@class,'prw_reviews_user_links_hs')]/span/@data-memberid").extract()),
                                   None)
        item["review_language"] = next(iter(response.xpath(
            "//div[contains(@class,'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div/div[contains(@class,'prw_reviews_user_links_hs')]/span/@data-language").extract()),
                                       None)
        item["review_id"] = next(iter(response.xpath(
            "//div[contains(@class,'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div/div[contains(@class,'prw_reviews_user_links_hs')]/span/@data-reviewid").extract()),
                                 None)
        item["review_id"] = next(iter(response.xpath(
            "//div[contains(@class,'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div/div[contains(@class,'prw_reviews_user_links_hs')]/span/@data-reviewid").extract()),
                                 None)

        item["pid"] = next(iter(response.xpath(
            "//div[contains(@class,'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div/div[contains(@class,'prw_reviews_user_links_hs')]/span/@data-pid").extract()),
                           None)
        item["locid"] = next(iter(response.xpath(
            "//div[contains(@class,'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div/div[contains(@class,'prw_reviews_user_links_hs')]/span/@data-locid").extract()),
                             None)

        review_id = item["review_id"]
        review_url_on_page = response.xpath('//script[@type="application/ld+json"]/text()').extract()
        review = eval(review_url_on_page[0])

        item["review"] = review["reviewBody"].replace("\\n", "")
        item["title"] = review["name"]
        item["rating"] = review["reviewRating"]["ratingValue"]
        item["image_url"] = review["image"]
        item["hotel_type"] = review["itemReviewed"]["@type"]
        item["hotel_name"] = review["itemReviewed"]["name"]
        item["hotel_adress"] = review["itemReviewed"]["address"]
        try:
            item["published_date"] = review["datePublished"]
        except KeyError:

            item["published_date"] = next(iter(response.xpath(
                f"//div[contains(@id,'review_{review_id}')]/div/div/span[@class='ratingDate']/@title""").extract()),
                                          None)

        item["trip_type"] = next(iter(response.xpath("//div[contains(@class,"
                                                     "'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div/div"
                                                     "/div/div/div[contains(@class,'noRatings')]/text()").extract()),
                                 None)

        try:
            item["trip_date"] = next(iter(response.xpath("//div[contains(@class,"
                                                         "'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div["
                                                         "contains(@class,'prw_reviews_stay_date_hsx')]/text()").extract(

            )), None)

        except:

            item["trip_date"] = next(iter(response.xpath(
                "//div[contains(@id,'review_538163624')]/div/div/div[@data-prwidget-name='reviews_stay_date_hsx']/text()").extract()),
                                     None)

        # user_info = response.xpath("//div[contains(@class,'prw_reviews_resp_sur_h_featured_review')]/div/div/div/div/div[contains(@class,'prw_reviews_user_links_hs')]").extract()[0]
        # item["unstructured"] = user_info_splitter(user_info)

        yield item


## 4. Crawling

In [None]:
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(MySpider)
process.start()

2019-02-05 08:37:08 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: scrapybot)
2019-02-05 08:37:08 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.19.0, Twisted 17.5.0, Python 3.6.5 |Anaconda, Inc.| (default, Apr 26 2018, 08:42:37) - [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2o  27 Mar 2018), cryptography 2.2.2, Platform Darwin-18.2.0-x86_64-i386-64bit
2019-02-05 08:37:08 [scrapy.crawler] INFO: Overridden settings: {'DOWNLOAD_DELAY': 2, 'FEED_FORMAT': 'json', 'FEED_URI': 'tripadvisorenfinal1.json', 'LOG_LEVEL': 30, 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


## 5. Importing and reading data scraped

If you've succeeded, you should see here a dataframe with 248 entries corresponding to the 248 reviews of the Center Parc you scraped. Congratulations ! 

In [1]:
import pandas as pd
dfjson2 = pd.read_json('tripadvisorenfinal.json')
#Previewing DF
dfjson2.to_csv('trip_advisor1.csv')
dfjson2.head()

Unnamed: 0,hotel_adress,hotel_name,hotel_type,image_url,locid,pid,published_date,rating,review,review_id,review_language,reviewer_id,title,trip_date,trip_type
0,"{'@type': 'PostalAddress', 'streetAddress': ''...",Center Parcs Longleat Forest,LodgingBusiness,https://media-cdn.tripadvisor.com/media/photo-...,523986,38673,"January 26, 2019",4,Great weekend break with 3 friends in an exclu...,648335275,en,0FE613A3F970F63FC8807F3CD41BF9CB,Fab villa.........security a bit lacking,January 2019,
1,"{'@type': 'PostalAddress', 'streetAddress': ''...",Center Parcs Longleat Forest,LodgingBusiness,https://media-cdn.tripadvisor.com/media/photo-...,523986,38673,"January 27, 2019",4,We\'ve been coming for the New Year celebratio...,648466584,en,ED7703F2E6551C6E08FA566331B1CE13,Still good but the last visit for us,December 2018,
2,"{'@type': 'PostalAddress', 'streetAddress': ''...",Center Parcs Longleat Forest,LodgingBusiness,https://media-cdn.tripadvisor.com/media/photo-...,523986,38673,"January 27, 2019",5,Long weekend here. First time I\u2019ve been t...,648468690,en,8087E71749FD08E4AA4A38F626396580,Friends birthday,January 2019,Traveled with friends
3,"{'@type': 'PostalAddress', 'streetAddress': ''...",Center Parcs Longleat Forest,LodgingBusiness,https://media-cdn.tripadvisor.com/media/photo-...,523986,38673,"January 27, 2019",4,We visited for 4 nights from Monday to Friday ...,648596082,en,0B87894C1BB125B19994E7F037CB63DF,Great family break,January 2019,Traveled with family
4,"{'@type': 'PostalAddress', 'streetAddress': ''...",Center Parcs Longleat Forest,LodgingBusiness,https://media-cdn.tripadvisor.com/media/photo-...,523986,38673,"January 28, 2019",4,I was a little dubious about going away in Jan...,648700282,en,2A078B56BD7FA857FF449C2BCB17FE61,Pricey but good,January 2019,


In [2]:
dfjson.info()

NameError: name 'dfjson' is not defined