# DS-SF-38 | 04 | Databases and Scraping | Codealong | Starter Code

## Part C | Scraping and Amazon Product Reviews (cont.)

> ## We are now ready to extract the reviews offline and no longer need to query the Amazon website.

In [29]:
import gzip
import json
import os

import dateutil
import dateutil.parser
import lxml.html
import numpy as np
import pandas as pd
# Keep rendered DataFrames compact: cap the display at 10 rows and 10 columns,
# and use the HTML table representation inside the notebook.
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

> ## Input

In [30]:
# Read the gzip-compressed JSON archive of previously downloaded review pages,
# so nothing below needs to query the website.
reviews_path = os.path.join('..', 'datasets', 'dataset-04-reviews.json.gz')

with gzip.open(reviews_path, 'rb') as f:
    pages = json.load(f)

In [31]:
len(pages)

9

## First page

In [32]:
# `pages` is keyed by the page number as a string; the markup of each page is
# stored under its 'content' key.
page = pages['1']['content']

In [33]:
# Show only the first 500 characters: the full page markup is far too long to
# be a useful cell output and bloats the saved notebook.
page[:500]



In [34]:
# Parse the page markup into an lxml element tree (the root is an HtmlElement).
document = lxml.html.fromstring(page)

In [35]:
type(document)

lxml.html.HtmlElement

(http://lxml.de/api/lxml.html-module.html#fromstring and http://lxml.de/api/lxml.html.HtmlElement-class.html)

> ## All reviews of a page

(http://lxml.de/api/lxml.etree._Element-class.html#xpath)

In [36]:
# XPath: every element, at any depth, whose data-hook attribute is "review".
reviews = document.xpath('//*[@data-hook="review"]')

In [37]:
len(reviews)

10

## First review

In [38]:
review = reviews[0]

In [39]:
type(review)

lxml.html.HtmlElement

> ## id

(http://lxml.de/api/lxml.etree._Element-class.html#get)

In [40]:
# The review identifier is the `id` attribute of the review element itself.
review.get('id')

'R5VZ5ES1K12GK'

> ## star rating

In [41]:
# The rating is not stored as text: it is encoded in the CSS classes of the
# star icon (an `a-star-N` class), so inspect the `class` attribute.
review.find('.//*[@data-hook="review-star-rating"]').get('class')

'a-icon a-icon-star a-star-5 review-rating'

(http://lxml.de/api/lxml.etree._Element-class.html#find)

(https://en.wikipedia.org/wiki/XPath)

> ## title

In [42]:
# findtext returns the text of the first descendant matching the path.
review.findtext('.//*[@data-hook="review-title"]')

'More comfortable than his traditional gold one'

(http://lxml.de/api/lxml.etree._Element-class.html#findtext)

> ## author

In [43]:
# The author name sits in a review-author element that is a direct child of
# another review-author element, hence the two-step path.
review.findtext('.//*[@data-hook="review-author"]/*[@data-hook="review-author"]')

'Amazon Customer'

> ## date

In [47]:
# Raw date text; note the leading 'on ' prefix that has to be removed before
# the value can be parsed as a date.
review.findtext('.//*[@data-hook="review-date"]')

'on September 7, 2017'

> ## body

In [44]:
# TODO

> ## Output

In [45]:
# Empty frame with one column per field extracted from a review.
df = pd.DataFrame(columns = ['date', 'id', 'author', 'title', 'body', 'star_rating'])

In [46]:
df

Unnamed: 0,date,id,author,title,body,star_rating


## Putting all of it together

(https://docs.python.org/2/howto/unicode.html and https://docs.python.org/2/library/stdtypes.html)

In [48]:
def date(node):
    """Return the date of a review.

    Parameters
    ----------
    node : lxml.html.HtmlElement
        A single review element.

    Returns
    -------
    datetime.datetime
        The value parsed from the review's ``review-date`` text.

    Raises
    ------
    AttributeError
        If the review has no ``review-date`` element (``findtext`` returns None).
    """
    # Read from the `node` argument rather than a notebook-level variable, so
    # the result depends only on the element that was passed in.
    text = node.findtext('.//*[@data-hook="review-date"]')

    # The text looks like 'on September 7, 2017'; drop the 'on ' prefix.
    text = text.replace('on ', '')

    return dateutil.parser.parse(text)

def id(node):
    """Return the review identifier, i.e. the element's ``id`` attribute.

    ``node.get`` yields None when the attribute is absent.
    """
    identifier = node.get('id')
    return identifier

def author(node):
    """Return the review author's name as ASCII-only text.

    Parameters
    ----------
    node : lxml.html.HtmlElement
        A single review element.

    Returns
    -------
    str
        The author name with every non-ASCII character removed.

    Raises
    ------
    AttributeError
        If the author element is missing (``findtext`` returns None).
    """
    name = node.findtext('.//*[@data-hook="review-author"]/*[@data-hook="review-author"]')

    # encode(..., 'ignore') drops non-ASCII characters but yields bytes on
    # Python 3; decode back so the DataFrame/CSV holds text, not b'...' values.
    return name.encode('ascii', 'ignore').decode('ascii')

def title(node):
    """Return the review title as ASCII-only text.

    Parameters
    ----------
    node : lxml.html.HtmlElement
        A single review element.

    Returns
    -------
    str
        The title with every non-ASCII character removed.

    Raises
    ------
    AttributeError
        If the title element is missing (``findtext`` returns None).
    """
    heading = node.findtext('.//*[@data-hook="review-title"]')

    # encode(..., 'ignore') drops non-ASCII characters but yields bytes on
    # Python 3; decode back so the DataFrame/CSV holds text, not b'...' values.
    return heading.encode('ascii', 'ignore').decode('ascii')

def body(node):
    """Return the review body as ASCII-only text.

    Parameters
    ----------
    node : lxml.html.HtmlElement
        A single review element.

    Returns
    -------
    str
        The body text with every non-ASCII character removed.

    Raises
    ------
    AttributeError
        If the body element is missing (``findtext`` returns None).
    """
    # NOTE(review): findtext yields only the text that precedes the element's
    # first child tag, so a body containing markup such as <br> may be cut
    # short - confirm against the data.
    text = node.findtext('.//*[@data-hook="review-body"]')

    # encode(..., 'ignore') drops non-ASCII characters but yields bytes on
    # Python 3; decode back so the DataFrame/CSV holds text, not b'...' values.
    return text.encode('ascii', 'ignore').decode('ascii')

def star_rating(node):
    """Return the star rating (1-5) of a review, or NaN when unavailable.

    The rating is encoded in the CSS classes of the star icon as
    ``a-star-N``.

    Parameters
    ----------
    node : lxml.html.HtmlElement
        A single review element.

    Returns
    -------
    int or float
        The rating as an int between 1 and 5; ``np.nan`` when the review has
        no rating icon or the icon carries no ``a-star-N`` class.
    """
    icon = node.find('.//*[@data-hook="review-star-rating"]')

    # Identity check on purpose: `find` signals "no match" with None, and an
    # element's truthiness/equality should not be relied on for that.
    if icon is None:
        return np.nan

    for stars in range(1, 6):
        if icon.find_class('a-star-{:d}'.format(stars)):
            return stars

    return np.nan

(http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.append.html)

In [49]:
# Parse every review of every page, visiting pages in numeric order (the keys
# are strings, so they are sorted by their integer value).
#
# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append copied the whole frame on every call and no longer exists in
# pandas 2.x. Rebuilding `df` from scratch also keeps this cell idempotent, so
# re-running it does not duplicate rows.
records = []

for i in sorted(pages.keys(), key = int):
    page = pages[i]['content']
    document = lxml.html.fromstring(page)
    reviews = document.xpath('//*[@data-hook="review"]')

    for review in reviews:
        records.append({'date'  : date(review),
                        'id'    : id(review),
                        'author': author(review),
                        'title' : title(review),
                        'body'  : body(review),
                        'star_rating': star_rating(review)})

# Passing `columns` fixes the column order and keeps the schema even when no
# review was found.
df = pd.DataFrame(records,
                  columns = ['date', 'id', 'author', 'title', 'body', 'star_rating'])

In [24]:
df

Unnamed: 0,date,id,author,title,body,star_rating
0,2017-09-07,R5VZ5ES1K12GK,Amazon Customer,More comfortable than his traditional gold one,My husband likes these. More comfortable than ...,5
1,2017-09-07,ROB5GWGD863UD,Vannie Woods,Great for my hubby,As a cabinet builder my hubby can't wear his w...,5
2,2017-09-07,R3G544CWPBQH9R,amy,Very comfy rings.,Very comfy rings. This is great option for peo...,5
3,2017-09-07,R3R5OQBD4J8M07,Amazon Customer,Five Stars,I ordered from local store for my husband and ...,5
4,2017-09-07,RCM64N5R62CAL,Babygotcheese,Five Stars,Perfect ring for active men,5
...,...,...,...,...,...,...
72,2017-06-22,R3FKE63ZFILHDP,Matthew DeLara,I love these rings!,I love these rings! I always end up having to ...,5
73,2017-06-22,R3703RZSG3WMA7,H ome,perfect fit!,"Because of my work, I usually do not take my r...",5
74,2017-06-18,R3RRFHEULQAIA5,Amazon Customer,Just what I needed.,Excellent product. True to size. Comfortable.,5
75,2017-06-17,R2G7PVVEZ5AFJT,Amazon Customer,Five Stars,My husband loved it! Wears in place of his wed...,5


In [25]:
df.shape

(77, 6)

In [26]:
# Save the parsed reviews in the same datasets folder as the input archive;
# index = False leaves the row index out of the file.
df.to_csv(os.path.join('..', 'datasets', 'dataset-04-reviews.csv'), index = False)