In [1]:
import json
import re
import time
import random

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

from utils.paths import URL_CONFIG_PATH, BOOKS_PATH
from utils.enums import URLS, SOUP

from scraping.pages import scrape_books
from scraping.session import get_session

In [5]:
# Values of the `author_href` column saved by a previous run, used as the
# "already visited" set. On a first run (no CSV yet) start from an empty set so
# that `visited_books` is always defined after this cell executes.
# NOTE(review): the name says "books" but the column holds author hrefs —
# confirm which key is meant to drive de-duplication.
if BOOKS_PATH.exists():
    visited_books = set(pd.read_csv(BOOKS_PATH)['author_href'].unique())
else:
    visited_books = set()

In [6]:
# Load the URL templates used by the scraper. JSON text is UTF-8 by
# specification, so decode it explicitly instead of relying on the
# platform's locale-dependent default encoding.
with open(URL_CONFIG_PATH, encoding='utf-8') as url_config_file:
    urls = json.load(url_config_file)

In [7]:
# Single session object shared by the listing-page scraper and by every
# book-page request made in the scraping loop.
# NOTE(review): what `get_session` configures (headers, retries, cookies) is
# not visible here — confirm in scraping.session.
session = get_session()

In [8]:
# One or more consecutive digits; used to pull numeric counts out of scraped text.
number_pattern = re.compile(r'\d+')
# One or more consecutive non-digit characters; used to pull the textual label
# that accompanies a count out of scraped text.
literal_pattern = re.compile(r'\D+')

In [6]:
def _joined_digits_to_int(text):
    """Join every digit run found in ``text`` into a single integer.

    Digit runs are concatenated before conversion, so a count that is split
    by separators in the page text is read as one number. Returns ``np.nan``
    when ``text`` contains no digits at all, instead of raising on ``int('')``.
    """
    digits = "".join(number_pattern.findall(text))
    return int(digits) if digits else np.nan


# Rebuilt from scratch on every run of this cell, so re-running is idempotent.
books_data_dict_list = []
# Only the first listing page is scraped; widen the range to crawl more pages.
for step in range(1, 2):
    books_urls = scrape_books(
        base_url=urls[URLS.BASE],
        page_url=urls[URLS.PAGE].format(step=step),
        session=session,
    )
    for book_url in books_urls:
        book_response = session.get(book_url)
        # Fail loudly on an HTTP error rather than parsing an error page as if
        # it were a book page (assumes a requests-style response object).
        book_response.raise_for_status()
        book_html = book_response.text
        soup_book = BeautifulSoup(book_html, SOUP.HTML_PARSER)

        # Author name and href to the author page. The first author is stored
        # under 'author' / 'author_href'; further ones get a numeric suffix
        # ('author1', 'author_href1', ...).
        authors_html = soup_book.find_all('a', class_='link-name d-inline-block')
        authors = {}
        for index, author in enumerate(authors_html):
            number = index if index > 0 else ''
            authors[f'author{number}'] = author.text
            authors[f'author_href{number}'] = author['href']

        # `find` returns None when the element is absent; record a missing
        # description as NaN instead of crashing the whole scrape.
        description_html = soup_book.find('div', class_='collapse-content')
        description = description_html.text if description_html is not None else np.nan

        # Reset the stats for every book so that an unexpected layout can never
        # carry over the previous book's numbers.
        number_of_user_opinions = np.nan
        number_of_user_ratings = np.nan
        number_of_discussions = np.nan
        user_stats_html = soup_book.find('div', class_='d-flex flex-wrap justify-content-around px-3')
        if user_stats_html is not None:
            user_stats = [int(match) for match in number_pattern.findall(user_stats_html.text)]
        else:
            user_stats = []
        if len(user_stats) == 2:
            # Two numbers: opinions and ratings; discussions are recorded as zero.
            number_of_user_opinions, number_of_user_ratings = user_stats
            number_of_discussions = 0
        elif len(user_stats) == 3:
            number_of_user_opinions, number_of_user_ratings, number_of_discussions = user_stats
        # Any other count of numbers leaves all three values as NaN (unknown).

        # Pair each <dt> label (trailing ':' removed) with the <dd> value at
        # the same position on the page.
        details_dict = dict(zip(
            [element.text.strip().rstrip(':') for element in soup_book.find_all('dt')],
            [element.text.strip() for element in soup_book.find_all('dd')],
        ))

        # Shelf statistics: the first non-digit run of each list item is the
        # label, the digits of the item form the count.
        on_the_shelf_dict_raw = {}
        for element in soup_book.find_all('li', class_='list-group-item p-0'):
            label_match = literal_pattern.search(element.text)
            if label_match is None:
                # No textual label to key the count by; skip the item.
                continue
            # Stored as an integer (NaN when no digits) so these columns have
            # the same numeric type as the rating counts below.
            on_the_shelf_dict_raw[label_match.group().strip()] = _joined_digits_to_int(element.text)
        on_the_shelf_dict = {
            'number_of_people_read': on_the_shelf_dict_raw.get('Przeczytane', np.nan),
            'number_of_people_has': on_the_shelf_dict_raw.get('Posiadam', np.nan),
            'number_of_people_favorite': on_the_shelf_dict_raw.get('Ulubione', np.nan),
            'number_of_people_wants_to_read': on_the_shelf_dict_raw.get('Chcę przeczytać', np.nan),
            'number_of_people_wants_as_gift': on_the_shelf_dict_raw.get('Chcę w prezencie', np.nan),
            'number_of_people_currently_read': on_the_shelf_dict_raw.get('Teraz czytam', np.nan)
        }

        # All tag labels of the book joined into one '&'-separated string.
        tags = '&'.join([element.text.strip() for element in soup_book.find_all('a', class_='tag')])

        # One 'rating_<value>' column per chart button, keyed by its
        # `data-rating` attribute.
        ratings_dict = {
            f'rating_{element["data-rating"]}': _joined_digits_to_int(element.text)
            for element in soup_book.find_all('a', class_='chart-valuebtn btn-link--without-bold plusCountModal')
        }

        books_data_dict = {
            **authors,
            'description': description,
            'number_of_user_opinions': number_of_user_opinions,
            'number_of_user_ratings': number_of_user_ratings,
            'number_of_discussions': number_of_discussions,
            **details_dict,
            **on_the_shelf_dict,
            'tags': tags,
            **ratings_dict
        }
        books_data_dict_list.append(books_data_dict)

        # Randomised pause between book requests to avoid hammering the site.
        time.sleep(random.uniform(0.5, 1.5))

In [7]:
# One row per scraped book. Columns are the union of the keys of all per-book
# dicts, so a key that is absent for a given book shows up as NaN in its row.
books_df = pd.DataFrame(books_data_dict_list)

In [10]:
# Quick visual check of the scrape: link to the first listed author of each book.
books_df['author_href']

0        https://lubimyczytac.pl/autor/3701/j-k-rowling
1        https://lubimyczytac.pl/autor/3701/j-k-rowling
2        https://lubimyczytac.pl/autor/3701/j-k-rowling
3        https://lubimyczytac.pl/autor/3701/j-k-rowling
4     https://lubimyczytac.pl/autor/14276/antoine-de...
5        https://lubimyczytac.pl/autor/3701/j-k-rowling
6        https://lubimyczytac.pl/autor/3701/j-k-rowling
7        https://lubimyczytac.pl/autor/3701/j-k-rowling
8      https://lubimyczytac.pl/autor/3216/j-r-r-tolkien
9     https://lubimyczytac.pl/autor/10474/george-orwell
10    https://lubimyczytac.pl/autor/22701/michail-bu...
11    https://lubimyczytac.pl/autor/3291/andrzej-sap...
12    https://lubimyczytac.pl/autor/24023/carlos-rui...
13    https://lubimyczytac.pl/autor/14755/stephenie-...
14    https://lubimyczytac.pl/autor/6087/suzanne-col...
15    https://lubimyczytac.pl/autor/3291/andrzej-sap...
16    https://lubimyczytac.pl/autor/19023/stieg-larsson
17        https://lubimyczytac.pl/autor/19390/da