# Amazon scraper
#### Description:
We do not have information about a book's publishing date, the category it is assigned to, or keywords that could describe it, so we decided to scrape this information. We created an Amazon Associates account to use the Product Advertising API, which gives us access to the Amazon database, although it is limited in both the number and the rate of requests. The Product Advertising API does not explicitly provide categories and keywords (about 50% of the responses contain the HTML of the product page, from which we can extract categories and keywords), so we have to scrape the Amazon product page to obtain the remaining ~50% of the information.

Scraping the Amazon website is really hard because it has an anti-scraping system that starts rejecting our requests after a certain number of them. This makes the process difficult to automate fully, so in practice it is done more or less iteratively.

### Import necessary libraries

In [None]:
from amazon_scraper import AmazonScraper
import pandas as pd
import numpy as np
import csv
import re
import json
import urllib3
from bs4 import BeautifulSoup
import time
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [None]:
# IPython shell escape: print the version of the `python` executable found on PATH
!python --version

In [None]:
# Load the Product Advertising API credentials from the local credentials file.
# The file is read with a tab-delimited reader, so each comma-separated line
# arrives as a single field that is split by hand below.
with open('/home/adam/EPFL_courses/PAAPICredentials.csv', newline='') as credentials_file:
    credential_rows = list(csv.reader(credentials_file, delimiter='\t', quotechar='|'))

# Only the second row (index 1) is used; it must hold exactly three
# comma-separated values (presumably the first row is a header - TODO confirm).
if len(credential_rows) > 1:
    Access_Key, Secret_Key, user_id = credential_rows[1][0].split(',')

# Create the API client used by the rest of the notebook
# (MaxQPS presumably caps the query rate - see the amazon_scraper docs).
amzn = AmazonScraper(Access_Key, Secret_Key, user_id, Region='US', MaxQPS=0.9)

In [None]:
# Load the table of book ids and their number of reviews into a DataFrame
reviews_per_book_path = "number_of_reviews_per_book.csv"
books_df = pd.read_csv(reviews_per_book_path)

In [None]:
# Test API: look up a single hard-coded item id to check that the client works
p = amzn.lookup(ItemId='0984588205')
# Bare expression on the cell's last line, so the notebook displays the author
p.product.author

In [None]:
# Data collector that uses API
#
# For every book id in the selected slice of `books_df`, query the Product
# Advertising API, fall back to the public product page for categories and
# keywords when the API response cannot provide them, and append one JSON
# record per book to 'product_categories.json'.

# Separators used to split the content of the keywords meta tag
KEYWORD_SPLIT_PATTERN = re.compile(r'; |,|\*|/|;|-')


def _product_field(lookup_result, field_name):
    """Return `lookup_result.product.<field_name>`, or "" if it cannot be read."""
    try:
        return getattr(lookup_result.product, field_name)
    except Exception:
        return ""


def _product_date(lookup_result, field_names):
    """Return the first readable date among `field_names` in ISO format, else ""."""
    for field_name in field_names:
        try:
            return getattr(lookup_result.product, field_name).isoformat()
        except Exception:
            continue
    return ""


def _category_ladders(soup):
    """Return one list of link texts per "zg_hrsr_ladder" span found in `soup`."""
    return [[link.string for link in ladder.find_all("a")]
            for ladder in soup.find_all("span", "zg_hrsr_ladder")]


def _meta_keywords(soup):
    """Return the unique, lower-cased, stripped keywords of the keywords meta tag.

    Raises if `soup` has no element with name="keywords" (or no 'content').
    """
    content = soup.find(attrs={"name": "keywords"})['content'].replace(".", "")
    return list({part.lower().strip() for part in KEYWORD_SPLIT_PATTERN.split(content)})


def _fetch_product_page(pool, asin):
    """Download and parse the public Amazon product page for `asin`."""
    response = pool.request('GET', 'http://www.amazon.com/dp/' + asin)
    return BeautifulSoup(response.data, "lxml")


http = urllib3.PoolManager(headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'})
# Set start and end index of book id in loaded previously file
start_iter, end_iter = 20000, 22000
book_ids = books_df['Book_id'].values[start_iter:end_iter]
iter_num = len(book_ids)

for ind, asin in enumerate(book_ids):
    print(asin)
    # `except Exception` (not a bare `except`) keeps the best-effort behaviour
    # while still letting Ctrl-C / kernel interrupt stop the loop.
    try:
        # Try to extract as much as we can using the API
        p = amzn.lookup(ItemId=asin)

        # Reset for every book: the product page is fetched lazily, and a page
        # left over from a previous iteration must never be reused, otherwise
        # another book's keywords would be written into this record.
        bs = None

        try:
            categories = _category_ladders(p.soup)
        except Exception:
            # The API response has no usable HTML: scrape the product page
            bs = _fetch_product_page(http, asin)
            try:
                categories = _category_ladders(bs)
            except Exception:
                categories = []

        try:
            keywords = _meta_keywords(p.soup)
        except Exception:
            try:
                if bs is None:
                    bs = _fetch_product_page(http, asin)
                keywords = _meta_keywords(bs)
            except Exception:
                keywords = []

        tmp = {'asin': asin,
               'author': _product_field(p, 'author'),
               'title': _product_field(p, 'title'),
               'brand': _product_field(p, 'brand'),
               # Prefer the release date, fall back to the publication date
               'release_date': _product_date(p, ('release_date', 'publication_date')),
               'publisher': _product_field(p, 'publisher'),
               'categories': categories,
               'keywords': keywords,
              }

        # Append immediately so an interrupted run keeps everything collected so far
        with open('product_categories.json', 'a') as fp:
            json.dump(tmp, fp)
            fp.write("\n")

    except Exception:
        print("ASIN NOT FOUND")
    print("Processed {0}/{1} books".format(ind+1, iter_num))

In [None]:
# Amazon website scraper
#
# Reads the helper file 'product_categories_2.json' (one record per line),
# copies complete records unchanged, and for records with empty categories or
# keywords tries to scrape them from the product page. Every record, enhanced
# or not, is appended to 'product_categories_enhanced.json'.
http = urllib3.PoolManager(headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'})
# Back-off delay in seconds; grows by 3 after every failed attempt and is
# reset to 3 after a success.
timeout = 3
# Upper bound on attempts per product, so that a page which genuinely lacks the
# keywords meta tag cannot keep the loop retrying forever.
max_attempts = 5
# Headers used for the connection pool that replaces `http` after a failure.
# NOTE(review): 'br' is advertised in Accept-Encoding; confirm the installed
# urllib3 can decode brotli responses, otherwise parsing will keep failing.
retry_headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate, br'}
keyword_split_pattern = re.compile(r'; |,|\*|/|;|-')

with open('product_categories_2.json', 'r') as file:
    for line in file:
        if not line.strip():
            # Ignore blank lines instead of failing to parse them
            continue
        # NOTE(review): this used to be eval(line). The records are JSON lines,
        # so json.loads is the matching parser: it does not execute file content
        # and it understands null/true/false, which eval() does not.
        tmp = json.loads(line)
        if tmp['categories'] == [] or tmp['keywords'] == []:
            print(tmp['asin'])
            for attempt in range(max_attempts):
                try:
                    # Fetch inside the retry loop: re-parsing the same failed
                    # response can never succeed, a new request can.
                    r = http.request('GET', 'http://www.amazon.com/dp/' + tmp['asin'])
                    bs = BeautifulSoup(r.data, "lxml")
                    categories = [[cat.string for cat in cat_list.find_all("a")]
                                  for cat_list in bs.find_all("span", "zg_hrsr_ladder")]
                    keywords_content = bs.find(attrs={"name": "keywords"})['content'].replace(".", "")
                    keywords = list({part.lower().strip()
                                     for part in keyword_split_pattern.split(keywords_content)})
                except Exception:
                    timeout = timeout + 3
                    print("Error, can get connection, increasing timeout = {}".format(timeout))
                    time.sleep(timeout + np.random.random())
                    # Start over with a fresh pool and different browser headers
                    http = urllib3.PoolManager(headers=retry_headers)
                else:
                    # Store the results only when the whole attempt succeeded,
                    # so a failed attempt never wipes data the record already had
                    tmp['categories'] = categories
                    tmp['keywords'] = keywords
                    timeout = 3
                    break
            else:
                print("Giving up on {} after {} attempts".format(tmp['asin'], max_attempts))

            # Short randomized pause between products
            time.sleep(1 + np.random.random())
        with open('product_categories_enhanced.json', 'a') as fp:
            json.dump(tmp, fp)
            fp.write("\n")