In [44]:
from urllib.parse import urljoin

import bs4 as bs
import pandas as pd
import regex as re
import requests

# TODO keep in mind that when you move your code to file you can just import stuff
# For example
from config import *
from scraping_utils import get_category_books_urls

Constants

In [45]:
# Site root; hrefs scraped from pages are combined with it to form full URLs.
URL_SOURCE = "https://www.goodreads.com/"

# Prefix of the per-tag listing pages; a category slug is appended directly.
URL_START = URL_SOURCE + "list/tag/"

# Tag slugs to scrape, in scraping order. Note that "non-fiction" and
# "nonfiction" are two separate entries.
BOOK_CATEGORIES = [
    "romance", "fiction", "young-adult", "fantasy", "science-fiction",
    "non-fiction", "children", "history", "covers", "mystery",
    "horror", "best", "historical-fiction", "gay", "paranormal",
    "love", "titles", "contemporary", "middle-grade", "historical-romance",
    "biography", "thriller", "series", "women", "nonfiction",
    "classics", "lgbt", "graphic-novels", "memoir", "queer",
]

Auxiliary functions

In [46]:
def get_page_body(url: str):
    """Download ``url`` and return the parsed ``<body>`` element of the page.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``body`` attribute of the parsed BeautifulSoup document, or
        ``None`` when the request raised an exception (the error is printed)
        or the response status code is not 200.
    """
    # Sentinel: stays None if requests.get raises, so the status check below
    # never touches an unbound name.
    response = None
    try:
        response = requests.get(url, timeout=10)
    except requests.exceptions.HTTPError as errh:
        print("http error:", errh)
    except requests.exceptions.ConnectionError as errc:
        print("conection error:", errc)
    except requests.exceptions.Timeout as errt:
        print("timeout error:", errt)
    except requests.exceptions.RequestException as err:
        print("other error:", err)

    if response is None or response.status_code != 200:
        return None

    # NOTE(review): no parser is named here, so BeautifulSoup chooses one
    # itself; consider pinning it if results must be identical across machines.
    page = bs.BeautifulSoup(response.text)
    return page.body
    
def clean_text(s: str):
    """Return ``s`` with every run of whitespace collapsed to a single space.

    Newlines and tabs are first turned into spaces; the text is then split on
    whitespace and re-joined, which also drops leading and trailing blanks.
    """
    flattened = re.sub(r'[\n\t]', ' ', s)
    words = flattened.split()
    return ' '.join(words)

    
def get_category_urls(input_url: str = URL_START, top_n:int = 1) -> dict:
    """Find, for every category, the paginated URLs of its first linked list.

    For each slug in ``BOOK_CATEGORIES`` the page ``input_url + slug`` is
    fetched and the first link inside the ``div.listImgs`` element is taken as
    the list to scrape.

    Args:
        input_url: Prefix to which each category slug is appended.
        top_n: Number of list pages to generate per category (pages 1..top_n).

    Returns:
        Mapping of category slug to a list of page URLs. Categories whose page
        could not be fetched, or that contain no list link, are left out.
    """
    category_urls = {}
    for category in BOOK_CATEGORIES:
        page_body = get_page_body(input_url + category)
        if not page_body:
            continue

        # Either lookup can come back empty; skip the category rather than
        # abort the whole run with an AttributeError/TypeError.
        list_images = page_body.find("div", {"class": "listImgs"})
        anchor = list_images.find("a", href=True) if list_images is not None else None
        if anchor is None:
            print("no list link found for category:", category)
            continue

        # urljoin avoids the doubled slash that plain concatenation produces
        # when the href is root-relative (presumably "/list/show/..." here).
        list_url = urljoin(URL_SOURCE, anchor["href"])
        category_urls[category] = [f"{list_url}?page={i}" for i in range(1, top_n + 1)]
    return category_urls


def get_separate_book_urls(url: str):
    """Collect the absolute URLs of all books linked from one list page.

    Args:
        url: Address of a list page.

    Returns:
        One URL per ``a.bookTitle`` link found on the page, in page order;
        an empty list when the page could not be fetched.
    """
    page_body = get_page_body(url)
    if not page_body:
        return []

    # href=True skips anchors without a target instead of raising KeyError;
    # urljoin avoids a doubled slash when the href is root-relative
    # (presumably "/book/show/..." here).
    return [
        urljoin(URL_SOURCE, anchor["href"])
        for anchor in page_body.find_all("a", {"class": "bookTitle"}, href=True)
    ]


def get_text(x):
    """Return the cleaned ``text`` attribute of ``x``.

    Objects without a ``text`` attribute (for example ``None``) are treated
    as having empty text.
    """
    raw_text = getattr(x, "text", "")
    return clean_text(raw_text)


def get_book_info(url: str, book_category: str):
    """Scrape one book page into a flat dictionary of text fields.

    Args:
        url: Address of the book page.
        book_category: Category slug recorded with the book.

    Returns:
        A dict with the keys ``category``, ``title``, ``author``,
        ``description``, ``rating``, ``number_of_pages`` and ``url`` (in that
        order), or an empty dict when the page could not be fetched. Fields
        whose element is not on the page come back as empty strings.
    """
    body = get_page_body(url)
    if not body:
        return {}

    # (output key, tag name, attribute filter) for every scraped field.
    field_selectors = (
        ("title", "h1", {"id": "bookTitle"}),
        ("author", "span", {"itemprop": "name"}),
        ("description", "div", {"id": "description"}),
        ("rating", "span", {"itemprop": "ratingValue"}),
        ("number_of_pages", "span", {"itemprop": "numberOfPages"}),
    )

    record = {"category": book_category}
    for field, tag_name, attrs in field_selectors:
        record[field] = get_text(body.find(tag_name, **attrs))
    record["url"] = url
    return record

Main scraping script

In [47]:
category_urls = get_category_urls()

# Gather one dict per book and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
book_records = []
for category, page_urls in category_urls.items():
    for page_url in page_urls:
        books_urls_list = get_separate_book_urls(page_url)
        for book_url in books_urls_list:
            book_info = get_book_info(book_url, category)
            if book_info:  # an empty dict means the book page could not be fetched
                book_records.append(book_info)

books_db = pd.DataFrame(book_records)
books_db.to_parquet("books_info.parquet")

In [None]:
# Load the scraped book records back from the parquet file written by the scraping step.
df = pd.read_parquet("books_info.parquet")