In [10]:
import os
import queue
import threading
import time


class QueueThread(threading.Thread):
    """Worker thread that drains a queue of argument tuples through ``func``.

    Every queue item is unpacked as positional arguments (``func(*item)``).
    The thread stops as soon as it finds the queue empty.
    """

    def __init__(self, func, queue):
        """Store the callable and the queue it will be fed from.

        Args:
            func: Callable invoked once per queue item as ``func(*item)``.
            queue: Queue object holding the argument tuples; it must provide
                ``get_nowait()`` and ``task_done()``.
        """
        super().__init__()
        self.func = func
        self.queue = queue

    def run(self):
        """Process queue items until the queue is empty.

        ``task_done()`` is called for every item that was taken, even when
        ``func`` raises, so a ``join()`` on the queue cannot block forever
        because of a failed item. A failure is reported on stderr and the
        thread moves on to the next item.
        """
        import traceback

        while True:
            # Only the dequeue is guarded here, so a queue.Empty raised from
            # inside func is not mistaken for "no more work".
            try:
                args = self.queue.get_nowait()
            except queue.Empty:
                break

            try:
                self.func(*args)
            except Exception:
                # Report and keep going: the remaining items still need a worker.
                traceback.print_exc()
            finally:
                self.queue.task_done()

In [11]:
from PIL import Image
import imagehash

def compare_image(image1, image2, title, cutoff=10):
    """Tell whether two images look alike according to their average hashes.

    Args:
        image1: First image, in any form accepted by ``Image.open``.
        image2: Second image, in any form accepted by ``Image.open``.
        title: Label printed next to the hashes; it does not affect the result.
        cutoff: Exclusive upper bound on the number of bits that may differ
            between the two hashes for the images to count as similar.

    Returns:
        The result of ``(hash0 - hash1) < cutoff``.
    """
    # The context managers close both image files as soon as hashing is done.
    with Image.open(image1) as first, Image.open(image2) as second:
        hash0 = imagehash.average_hash(first)
        hash1 = imagehash.average_hash(second)

    difference = hash0 - hash1

    print (hash0, hash1, difference, title)
    return difference < cutoff

In [12]:
import requests

def common_get(url, timeout=30, retry_delay=5, max_retries=None):
    """GET ``url``, retrying on rate limiting and on connection problems.

    Args:
        url: Address to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled connection raises instead of blocking forever.
        retry_delay: Seconds to sleep before every retry.
        max_retries: Number of retries allowed after the first attempt.
            ``None`` (the default) retries without limit.

    Returns:
        The response object. It only has the "too many requests" status when
        ``max_retries`` has been used up.

    Raises:
        requests.exceptions.ConnectionError, requests.exceptions.Timeout:
            Only when ``max_retries`` is set and has been used up.
    """
    attempt = 0
    while True:
        attempt += 1
        out_of_retries = max_retries is not None and attempt > max_retries

        try:
            response = requests.get(url, timeout=timeout)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            if out_of_retries:
                raise
        else:
            rate_limited = response.status_code == requests.codes.too_many_requests
            if not rate_limited or out_of_retries:
                return response

        # Pause before trying again instead of hammering the server in a tight loop.
        time.sleep(retry_delay)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Similarity cut-off for title matching.
# NOTE(review): presumably meant to be compared with compare_title's score — confirm at the call site.
threshold = 0.5

def compare_title(title1, title2, subtitle2):
    """Score how closely ``title1`` matches a second title, with or without its subtitle.

    Three strings are TF-IDF vectorised together: ``title1``, ``title2`` and
    ``"{title2} {subtitle2}"``. The score is the larger of the two dot products
    between the vector of ``title1`` and the vectors of the other two strings.
    (With TfidfVectorizer's default L2 normalisation these are cosine similarities.)

    Args:
        title1: Reference title.
        title2: Candidate title.
        subtitle2: Candidate subtitle; may be an empty string.

    Returns:
        float: The best similarity, or 0.0 when the vectoriser cannot build a
        vocabulary from the three strings.
    """
    corpus = [title1,
              title2,
              f"{title2} {subtitle2}"]

    vect = TfidfVectorizer(min_df=1, stop_words=None)
    try:
        tfidf = vect.fit_transform(corpus)
    except ValueError:
        # Raised when no string yields a token (empty vocabulary): nothing to compare.
        return 0.0

    # Row 0 holds title1 against every string; column 0 (itself) is skipped.
    # .toarray() is the explicit form of the deprecated `.A` shorthand.
    scores = (tfidf * tfidf.T).toarray()[0, 1:]
    return float(scores.max())

# Example: a title carrying a series suffix compared with the bare title and an empty subtitle.
compare_title(
    "Starving Hearts (Triangular Trade Trilogy, #1)",
    "Starving Hearts",
    ""
)

0.4343672818844282

In [15]:
import logging, sys, os, time

def get_logger_handler(log_path=None, is_print=False, level=logging.DEBUG):
    """Build the list of logging handlers requested by the arguments.

    Args:
        log_path: When truthy, a UTF-8 ``FileHandler`` writing to this path is included.
        is_print: When true, a ``StreamHandler`` on ``sys.stdout`` is included.
        level: Level applied to every handler that is created.

    Returns:
        list: The stdout handler (if requested) followed by the file handler
        (if requested). All of them share one formatter; the list is empty
        when neither was requested.
    """
    shared_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Create the requested handlers first, then configure them all the same way.
    created = []
    if is_print:
        created.append(logging.StreamHandler(sys.stdout))
    if log_path:
        created.append(logging.FileHandler(log_path, encoding="utf-8"))

    for each in created:
        each.setLevel(level)
        each.setFormatter(shared_format)

    return created

# The log directory must exist before a file handler can open a file inside it.
os.makedirs("./log", exist_ok=True)

# Stand-alone logger (created directly, not via logging.getLogger) that writes
# INFO and above to a file named after the current local time.
logger = logging.Logger("Book Scrape", level=logging.INFO)
logger.handlers = get_logger_handler(
    "./log/" + time.strftime('%Y-%m-%d %H-%M-%S', time.localtime()) + ".log"
)

In [16]:
import pandas
import requests
from bs4 import BeautifulSoup
import math
import tqdm
import urllib
import json

# Column names of the result table, in the same order as the values collected for each book.
headers = [
    "Title of Book", "Number of reviews", "Rating",
    "Description", "Price", "Product Type",
    "Book category", "Availability", "Author",
]

# One list of values per scraped book; filled in by the worker threads.
book_info_list = []

# Fetch the landing page. The timeout keeps an unresponsive server from
# blocking this cell indefinitely.
response = requests.get("http://books.toscrape.com/index.html", timeout=30)

soup = BeautifulSoup(response.content, "html.parser")

# Commented-out alternative that reads the totals from the page instead of fixing them by hand:
# total, _, step = [int(item.string) for item in soup.find("form", "form-horizontal", method="get").find_all("strong")]

total = 200  # length of the progress bar; with `step` it determines how many pages are queued
step = 20    # divisor used to turn `total` into a number of catalogue pages

print("total =", total, step)

# Shared progress bar; get_title advances it by one for every book it finishes.
pbar = tqdm.tqdm(range(total), position=0, leave=True)

def get_book_list(page, workers=20):
    """Scrape one catalogue page and process every book linked from it.

    The book links found on the page are queued and handed to ``get_title``
    through ``QueueThread`` workers; the call returns once the queue reports
    that every link has been processed.

    Args:
        page: Page number substituted into ``catalogue/page-{page}.html``.
        workers: Upper bound on the number of threads started for this page.
    """
    response = common_get(f"http://books.toscrape.com/catalogue/page-{page}.html")

    soup = BeautifulSoup(response.content, "html.parser")
    # find_all is the current spelling of the legacy findAll alias.
    urls = [item.a["href"] for item in soup.find_all("article", "product_pod")]
    book_queue = queue.Queue()

    for url in urls:
        book_queue.put((url, ))

    # Starting more threads than there are links would only create idle threads.
    for _ in range(min(workers, len(urls))):
        QueueThread(get_title, book_queue).start()
    book_queue.join()

# Maps the lower-cased rating word taken from a book's star-rating CSS class to its number.
rating_str_to_num = {
    word: stars
    for stars, word in enumerate(("one", "two", "three", "four", "five"), start=1)
}

# Make sure the ./img directory exists; exist_ok avoids an error when it is already there.
os.makedirs("./img", exist_ok=True)

def _product_table_value(soup, label):
    """Return the text of the product table cell whose row header is ``label``."""
    table = soup.find("table", "table table-striped")
    return table.find("th", text=label).find_next_sibling("td").string


def _google_book_candidates(google_url):
    """Fetch ``google_url`` and return the list under its 'items' key, or [] when there is none."""
    response = common_get(google_url)
    try:
        payload = json.loads(response.content)
    except ValueError:
        # The body was not JSON; treat it as "no candidates" rather than killing the worker.
        logger.warning(f'Non-JSON response from {google_url}')
        return []

    if isinstance(payload, dict) and 'items' in payload:
        return payload['items']

    # No results (or an error document): show what came back and carry on without candidates.
    print(payload)
    return []


def get_title(url):
    """Scrape one book page and record its details together with its author.

    The page at ``http://books.toscrape.com/catalogue/{url}`` supplies the
    title, review count, rating, description, price, product type, category
    and availability. The author is looked up through the Google Books volumes
    API: the first three results are compared with the title, and the first
    one whose similarity reaches ``threshold`` and that lists authors is used.
    The author is None when no result qualifies.

    Args:
        url: Book page path relative to the catalogue directory.

    Returns:
        None. The collected values are appended to ``book_info_list`` and the
        shared progress bar is advanced by one.
    """
    # `import urllib` by itself does not import the `parse` submodule.
    import urllib.parse

    response = common_get(f"http://books.toscrape.com/catalogue/{url}")

    soup = BeautifulSoup(response.content, "html.parser")

    product_main = soup.find("div", "product_main")
    title = product_main.h1.string
    description_header = soup.find("div", id="product_description")

    book_info = [
        title,
        int(_product_table_value(soup, "Number of reviews")),
        # The second CSS class of the star-rating element is the rating word ("One" ... "Five").
        rating_str_to_num[product_main.find("p", "star-rating")["class"][1].lower()],
        description_header.find_next("p").string if description_header else None,
        _product_table_value(soup, "Price (excl. tax)"),
        _product_table_value(soup, "Product Type"),
        # The category is the breadcrumb entry just before the active (current) one.
        soup.find("ul", "breadcrumb").find("li", "active").find_previous_sibling("li").a.string,
        _product_table_value(soup, "Availability"),
    ]

    # Cover-image downloading was disabled (commented out) in the original,
    # so no image URL or path is computed here.

    google_url = f'https://www.googleapis.com/books/v1/volumes?q={urllib.parse.quote(title, "/(),")}'

    author = None

    for book in _google_book_candidates(google_url)[:3]:
        volume = book.get("volumeInfo", {})
        google_title = volume.get("title", "")
        subtitle = volume.get("subtitle", "")

        similarity = compare_title(title, google_title, subtitle)
        logger.info(f'\n{google_url}\n{similarity}\n{title}\n{google_title} {subtitle}\n')

        # Compare the score with the cut-off: any non-zero score is truthy,
        # so testing the bare value would accept almost every candidate.
        if similarity < threshold or "authors" not in volume:
            continue

        author = ", ".join(volume["authors"])
        break

    book_info.append(author)

    # List append is thread-safe.
    book_info_list.append(book_info)
    pbar.update(1)

# Queue one task per catalogue page; pages are numbered from 1.
list_queue = queue.Queue()

page_count = math.ceil(total / step)
for idx in range(1, page_count + 1):
    list_queue.put((idx, ))

# At most 25 page workers, and never more than there are pages to fetch.
for _ in range(min(25, page_count)):
    QueueThread(get_book_list, list_queue).start()
list_queue.join()

# All pages are done: finalise the manually updated progress bar.
pbar.close()

# Rows appear in the order the worker threads finished them.
# Note that the file is tab-separated despite its .csv extension.
pandas.DataFrame(book_info_list, columns=headers).to_csv("./result.csv", sep='\t')

print("Done")

total = 200 20


  0%|          | 0/200 [11:56<?, ?it/s]
100%|██████████| 200/200 [01:18<00:00,  7.61s/it]