In [8]:
import os
import queue
import threading
import time


class QueueThread(threading.Thread):
    """Daemon worker thread that feeds queue items to a callable.

    Every item taken from the queue is unpacked as positional arguments,
    i.e. ``func(*item)``. The thread stops as soon as it finds the queue
    empty, so the queue has to be filled before the workers are started.

    Attributes:
        func: Callable invoked once per queue item.
        queue: The queue the argument tuples are taken from.
    """

    def __init__(self, func, queue):
        super().__init__()
        self.func = func
        self.queue = queue
        self.daemon = True

    def run(self):
        """Process items until the queue is empty.

        An exception raised by ``func`` is logged and the worker moves on to
        the next item. ``task_done()`` is called for every item taken, even a
        failing one, so that ``queue.join()`` in the caller cannot block
        forever on work that crashed.
        """
        # Local import: the cell defining this class does not import logging.
        import logging

        while True:
            try:
                # `queue` here is the module; the instance's queue is self.queue.
                args = self.queue.get_nowait()
            except queue.Empty:
                return
            try:
                self.func(*args)
            except Exception:
                logging.getLogger("QueueThread").exception(
                    "%s failed for arguments %r", getattr(self.func, "__name__", self.func), args
                )
            finally:
                self.queue.task_done()

In [9]:
import requests

def common_get(url, max_retries=None, backoff=1.0, max_backoff=30.0):
    """GET ``url``, retrying on connection errors, timeouts and HTTP 429.

    Between attempts the function sleeps with a capped exponential backoff
    instead of retrying immediately, so a rate-limiting or unreachable server
    is not hit in a tight loop.

    NOTE: relies on the module-level ``net_logger`` and ``time``; both must be
    defined before the first call.

    Args:
        url: Address to request.
        max_retries: Number of retries allowed after the first attempt.
            ``None`` (the default) retries without limit.
        backoff: Delay in seconds before the first retry; doubled after every
            further failed attempt.
        max_backoff: Upper bound in seconds for a single delay.

    Returns:
        The ``requests.Response``. When ``max_retries`` is exhausted on
        HTTP 429 the last 429 response is returned so the caller can inspect it.

    Raises:
        requests.exceptions.ConnectionError, requests.exceptions.Timeout:
            Only when ``max_retries`` is set and has been exhausted.
    """
    attempt = 0
    while True:
        # The exponent is capped so the power can never overflow a float.
        delay = min(backoff * 2 ** min(attempt, 16), max_backoff)
        out_of_retries = max_retries is not None and attempt >= max_retries

        try:
            response = requests.get(url, timeout=5)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            net_logger.info(e)
            if out_of_retries:
                raise
        else:
            if response.status_code != requests.codes.too_many_requests or out_of_retries:
                return response
            # Honour a numeric Retry-After header, but never beyond max_backoff.
            retry_after = response.headers.get("Retry-After", "")
            if retry_after.isdigit():
                delay = min(max(delay, int(retry_after)), max_backoff)

        attempt += 1
        time.sleep(delay)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Minimum similarity a Google Books title must exceed to be treated as the same book.
threshold = 0.5

def compare_title(title1, title2, subtitle2):
    """Score how similar ``title1`` is to a second title, with or without its subtitle.

    The three strings ``title1``, ``title2`` and ``"title2 subtitle2"`` are
    vectorised with TF-IDF and ``title1`` is compared against the other two.

    Args:
        title1: Reference title.
        title2: Candidate title.
        subtitle2: Candidate subtitle; may be an empty string.

    Returns:
        The larger of the two similarity scores. ``0.0`` when none of the
        strings contains a usable token.
    """
    corpus = [title1,
              title2,
              f"{title2} {subtitle2}"]

    vect = TfidfVectorizer(min_df=1, stop_words=None)
    try:
        tfidf = vect.fit_transform(corpus)
    except ValueError:
        # NOTE(review): scikit-learn raises ValueError for an empty vocabulary
        # (no string yields a token); treated here as "no similarity".
        return 0.0

    # `@` is matrix multiplication for both sparse matrices and sparse arrays,
    # and `.toarray()` replaces the `.A` shortcut that newer SciPy removed.
    pairwise = (tfidf @ tfidf.T).toarray()
    similarity = max(pairwise[0, 1:])
    return similarity

# Example: score a title that carries a parenthesised series suffix against the
# bare title with an empty subtitle. The value is shown as the cell output.
compare_title(
    "Starving Hearts (Triangular Trade Trilogy, #1)",
    "Starving Hearts",
    ""
)

0.4343672818844282

In [11]:
import logging, sys, os, time

def get_logger_handler(log_path=None, is_print=False, level=logging.DEBUG):
    """Build the list of logging handlers requested by the arguments.

    Args:
        log_path: If truthy, a UTF-8 ``FileHandler`` writing to this path is included.
        is_print: If true, a ``StreamHandler`` writing to ``sys.stdout`` is included.
        level: Level applied to every handler that is created.

    Returns:
        A list with the stdout handler first (if requested) and the file
        handler second (if requested). Empty when neither is requested.
        All handlers share one formatter.
    """
    shared_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Create the requested handlers first, then configure them uniformly.
    created = []
    if is_print:
        created.append(logging.StreamHandler(sys.stdout))
    if log_path:
        created.append(logging.FileHandler(log_path, encoding="utf-8"))

    for item in created:
        item.setLevel(level)
        item.setFormatter(shared_format)

    return created

# All log files are written below ./log; create the directory if it is missing.
os.makedirs("./log", exist_ok=True)

# Logger for the book-scraping steps. Only a file handler is attached (is_print
# keeps its default of False); the file name carries the local time of creation.
book_logger = logging.Logger("Book Scrape", level=logging.INFO)
book_logger.handlers = get_logger_handler(f"./log/book {time.strftime('%Y-%m-%d %H-%M-%S', time.localtime())}.log")
# Logger for network problems; common_get() writes connection errors and
# timeouts to it. It gets its own file, named the same way.
net_logger = logging.Logger("Net", level=logging.INFO)
net_logger.handlers = get_logger_handler(f"./log/net {time.strftime('%Y-%m-%d %H-%M-%S', time.localtime())}.log")

In [12]:
import logging
import sys
import time


class DebugTimer:
    """A simple class to record runtime of some codes.

    Can be driven manually with ``start()`` / ``end()`` or used as a context
    manager (``with DebugTimer("step"): ...``). The measured duration is
    logged at INFO level when the timer ends.

    Attributes:
        start_time: A double of the timer's start time.
        end_time: A double of the timer's end time.
        desc: A string of the description of target process.
        print_format: A string of format of print().
        logger: A logger to output the information.
    """
    start_time = 0
    end_time = 0
    desc = str()
    print_format = str()
    # Class-level default kept for backward compatibility only. Each instance
    # creates its own logger in __init__, so timers do not overwrite each
    # other's handlers.
    logger = logging.Logger("DebugTimer", level=logging.INFO)

    def __init__(self, desc="unknown process", is_print=True, logger=None):
        """Create a timer.

        Args:
            desc: Description of the timed process, used in the log message.
            is_print: When no ``logger`` is given, whether the duration is
                written to stdout.
            logger: Optional logger whose handlers receive the message. The
                record itself is still emitted under the name "DebugTimer".
        """
        self.desc = desc
        self.print_format = "The duration of {} is: {} s.\n"
        # Per-instance logger: assigning handlers on the shared class attribute
        # would redirect the output of every other timer as well.
        self.logger = logging.Logger("DebugTimer", level=logging.INFO)
        if logger is not None:
            self.logger.handlers = logger.handlers
        else:
            self.logger.handlers = get_logger_handler(is_print=is_print)

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Returns None, so an exception raised inside the block still propagates.
        self.end()

    def start(self):
        """Start the timer.
        """
        self.start_time = time.perf_counter()

    def end(self):
        """End the timer and log the elapsed time.
        """
        self.end_time = time.perf_counter()
        self.logger.info(self.print_format.format(self.desc, self.end_time - self.start_time))

In [13]:
import pandas
import requests
from bs4 import BeautifulSoup
import math
import tqdm
import urllib
import json
import re

# Column names of the result table, in the same order in which get_title()
# collects the values for each book.
headers = [
    "Title of Book",
    "Number of reviews",
    "Rating",
    "Description",
    "Price",
    "Product Type",
    "Book category",
    "Availability",
    "Author"
]

# Filled by the get_title() worker threads: one list of column values per book.
book_info_list = list()

# Fetch the landing page to read the catalogue size from it.
response = common_get("http://books.toscrape.com/index.html")

soup = BeautifulSoup(response.content, "html.parser")

# The form contains three <strong> numbers. The first is used as the total
# number of books and the third as the number of books per page; the middle
# one is ignored. (Presumably "N results - showing A to B" -- confirm on the page.)
total, _, step = [int(item.string) for item in soup.find("form", "form-horizontal", method="get").find_all("strong")]

# Presumably a smaller override for quick trial runs; left disabled.
# total = 20
# step = 20

print("total =", total, step)

def get_book_list(page, book_queue):
    """Collect the book links of one catalogue page and queue them.

    Args:
        page: Page number used in the catalogue URL ``page-{page}.html``.
        book_queue: Queue receiving one ``(href,)`` tuple per book on the page,
            with the href as it appears in the page.
    """
    response = common_get(f"http://books.toscrape.com/catalogue/page-{page}.html")

    soup = BeautifulSoup(response.content, "html.parser")
    # find_all() is the current name of the deprecated findAll() alias and is
    # what the rest of the notebook already uses.
    urls = [item.a["href"] for item in soup.find_all("article", "product_pod")]

    for url in urls:
        book_queue.put((url, ))


# Maps the lower-cased rating word taken from a book's "star-rating" CSS class
# to the number of stars.
rating_str_to_num = {
    word: stars
    for stars, word in enumerate(("one", "two", "three", "four", "five"), start=1)
}

# Make sure the ./img directory exists.
os.makedirs("./img", exist_ok=True)

def _product_table_value(soup, label):
    """Return the product-information table cell whose row header is ``label``."""
    table = soup.find("table", "table table-striped")
    return table.find("th", text=label).find_next_sibling("td").string


def _find_author(title):
    """Look ``title`` up on Google Books and return its authors, or None.

    Only the first three results are considered. A result is accepted when its
    title (optionally with subtitle) is more similar to ``title`` than
    ``threshold`` and it lists authors.
    """
    google_url = f'https://www.googleapis.com/books/v1/volumes?q={urllib.parse.quote(title, "/(),")}&langRestrict=en'
    with DebugTimer(title, logger=book_logger):
        response = common_get(google_url)

    if response.status_code != 200:
        print(response.content)

    try:
        google_books = json.loads(response.content)
    except ValueError:
        # An error body is not necessarily JSON; treat it as "no result"
        # instead of letting the worker thread crash.
        book_logger.info(f'\n{google_url}\nresponse is not valid JSON\n')
        return None

    if 'items' in google_books.keys():
        google_books = google_books['items']
    else:
        print(google_books)
        google_books = []

    for book in google_books[0:3]:
        # Some results lack the title; retry with the volume's own resource.
        if "title" not in book.get("volumeInfo", {}) and "selfLink" in book.keys():
            try:
                book = json.loads(common_get(book["selfLink"]).content)
            except ValueError:
                continue

        # .get(): the re-fetched volume may not contain "volumeInfo" at all.
        volume_info = book.get("volumeInfo", {})
        if "title" not in volume_info:
            print(book)
            continue

        google_title = volume_info["title"]
        subtitle = volume_info.get("subtitle", "")

        similarity = compare_title(title, google_title, subtitle)
        book_logger.info(f'\n{google_url}\n{similarity}\n{title}\n{google_title} {subtitle}\n')

        if similarity > threshold:
            if "authors" not in volume_info.keys():
                continue
            return ", ".join(volume_info["authors"])

    return None


def get_title(url):
    """Scrape one book page, look up its author and store the result row.

    The row is appended to the module-level ``book_info_list`` in the column
    order of ``headers`` and the module-level progress bar ``pbar`` is advanced
    by one.

    Args:
        url: Book page address relative to ``http://books.toscrape.com/catalogue/``.
    """
    response = common_get(f"http://books.toscrape.com/catalogue/{url}")

    soup = BeautifulSoup(response.content, "html.parser")

    product_main = soup.find("div", "product_main")
    title = product_main.h1.string
    # The description paragraph follows this header div; some books have none.
    description_header = soup.find("div", id="product_description")

    book_info = [
        title,
        int(_product_table_value(soup, "Number of reviews")),
        rating_str_to_num[product_main.find("p", "star-rating")["class"][1].lower()],
        description_header.find_next("p").string if description_header else None,
        _product_table_value(soup, "Price (excl. tax)"),
        _product_table_value(soup, "Product Type"),
        soup.find("ul", "breadcrumb").find("li", "active").find_previous_sibling("li").a.string,
        _product_table_value(soup, "Availability"),
    ]

    # Drop a parenthesised part containing "#" (e.g. a series marker) before searching.
    title = re.sub(r"\(.*#.*\)", "", title)

    book_info.append(_find_author(title))

    # List append is thread-safe.
    book_info_list.append(book_info)
    pbar.update(1)

# Stage 1 work items: (page number, output queue). Stage 2 work items: (book href,).
list_queue = queue.Queue()
book_queue = queue.Queue()

# The bar is advanced manually by get_title(), so it is given a total rather
# than an iterable to loop over.
pbar = tqdm.tqdm(total=total, position=0, leave=True)

for idx in range(1, math.ceil(total / step) + 1):
    list_queue.put((idx, book_queue))

# Stage 1: collect every book link. This must finish before stage 2 starts,
# because the workers exit as soon as they find their queue empty.
for _ in range(10):
    QueueThread(get_book_list, list_queue).start()
list_queue.join()

# Stage 2: scrape every book page.
for _ in range(20):
    QueueThread(get_title, book_queue).start()
book_queue.join()

# A manually driven bar has to be closed explicitly to flush its final state.
pbar.close()

# Rows are in the order in which the worker threads finished.
pandas.DataFrame(book_info_list, columns=headers).to_csv("./result.csv", sep='\t')

print("Done")

total = 1000 20


100%|██████████| 1000/1000 [44:13<00:00,  2.65s/it]
 86%|████████▋ | 865/1000 [07:36<01:26,  1.56it/s]

{'kind': 'books#volumes', 'totalItems': 0}


100%|██████████| 1000/1000 [09:22<00:00,  6.46s/it]

Done
