In [1]:
import json
import re
import time
import random

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

from utils.paths import URL_CONFIG_PATH, BOOKS_PATH, AUTHORS_PATH
from utils.enums import URLS, SOUP
from utils.regex import NUMBER_PATTERN, LITERAL_PATTERN

from scraping.pages import scrape_books
from scraping.session import get_session

In [3]:
# Load the URL templates used by the scraper (looked up via URLS.BASE / URLS.PAGE below).
# JSON is decoded as UTF-8 explicitly so the result does not depend on the platform's
# default locale encoding.
with open(URL_CONFIG_PATH, encoding='utf-8') as url_config_file:
    urls = json.load(url_config_file)

In [4]:
# Session object handed to scrape_books() and get_with_retry() in the cells below
session = get_session()

In [5]:
# utils
def get_with_retry(session, url, retries=5, delay=.5):
    """Fetch ``url`` through ``session`` and return the response text.

    The request is attempted up to ``retries`` times. Each failure is printed
    and, unless it was the last attempt, followed by a pause of ``delay``
    seconds.

    Args:
        session: Object exposing ``get(url)`` whose result has a ``.text`` attribute.
        url: Address to fetch.
        retries: Maximum number of attempts; must be at least 1.
        delay: Seconds to wait between two attempts.

    Returns:
        The ``.text`` of the first successful response.

    Raises:
        ValueError: If ``retries`` is smaller than 1.
        Exception: The error raised by the last attempt when all attempts fail
            (previously this surfaced as an unrelated ``UnboundLocalError``).
    """
    if retries < 1:
        raise ValueError('retries must be at least 1')

    last_error = None
    for attempt in range(retries):
        try:
            return session.get(url).text
        except Exception as e:
            last_error = e
            print(f'Error: {e}')
            # No point in waiting when there is no further attempt
            if attempt < retries - 1:
                time.sleep(delay)
    # Every attempt failed: surface the real cause to the caller
    raise last_error

In [None]:
# Accumulators for the rows collected during this run
books_data_dict_list, authors_data_dict_list = [], []

# Book URLs already present in the books CSV (empty set when the file does not exist)
visited_books = set(pd.read_csv(BOOKS_PATH)['url'].unique()) if BOOKS_PATH.exists() else set()

# Author URLs already present in the authors CSV (empty set when the file does not exist)
visited_authors = set(pd.read_csv(AUTHORS_PATH)['author_url'].unique()) if AUTHORS_PATH.exists() else set()

# Walk the listing pages, scrape every book found on them and, the first time
# an author URL is seen, that author's profile page as well.
# NOTE(review): 344 is a hard-coded bound — presumably the number of listing
# pages plus one; confirm against the site before re-running.
try:
    for step in range(1, 344):
        books_urls = scrape_books(base_url=urls[URLS.BASE], page_url=urls[URLS.PAGE].format(step=step), session=session)

        # Loop through each book URL to scrape book details
        for book_url in books_urls:
            book_html = get_with_retry(session, book_url)
            soup_book = BeautifulSoup(book_html, SOUP.HTML_PARSER)

            # Extract author names and hrefs
            authors_html = soup_book.find_all('a', class_='link-name d-inline-block')
            authors_names = [author.text for author in authors_html]
            authors_hrefs = [author['href'] for author in authors_html]

            # Scrape author: only the first listed author, once per URL.
            # A page without author links yields no author row instead of an
            # IndexError that would abort the whole run.
            author_href = authors_hrefs[0] if authors_hrefs else None
            if author_href is not None and author_href not in visited_authors:
                author_html = get_with_retry(session, author_href)
                author_soup = BeautifulSoup(author_html, SOUP.HTML_PARSER)
                author_author_name = author_soup.find('div', class_='author-main__header-wrapper').text
                # The comma is swapped for a dot so float() can parse the rating
                author_average_rating = float(author_soup.find('div', class_='author-box').find('span', class_='rating__avarage').text.replace(',', '.'))
                # Exactly two readers columns are expected: (read, wants to read);
                # spaces inside the numbers are stripped before conversion
                author_number_of_people_read, number_of_people_wants_to_read = [int(element.find('span').text.replace(' ', '')) for element in author_soup.find_all('div', class_='author-box__readers-col')]
                born_text = author_soup.find('span', class_='author-info__born').text.split()[-1]
                try:
                    author_date_of_birth = pd.to_datetime(born_text, format='%d.%m.%Y')
                except (ValueError, TypeError):
                    # Keep the raw text when it is not a full dd.mm.yyyy date
                    author_date_of_birth = born_text
                author_number_of_fans = int(author_soup.find('span', class_='author-box__number').text.replace(' ', ''))
                author_number_of_books_written = int(author_soup.find('div', class_='author-info__count').text)
                author_awards_html = author_soup.find('div', class_='author-info__count author-info__count--awards')
                # A missing awards element is recorded as zero awards
                author_number_of_awards = int(author_awards_html.text) if author_awards_html is not None else 0
                author_data_dict = {
                    'author_name': author_author_name,
                    'author_url': author_href,
                    'author_average_rating': author_average_rating,
                    'author_number_of_people_read': author_number_of_people_read,
                    'author_number_of_people_wants_to_read': number_of_people_wants_to_read,
                    'author_date_of_birth': author_date_of_birth,
                    'author_number_of_fans': author_number_of_fans,
                    'author_number_of_books_written': author_number_of_books_written,
                    'author_number_of_awards': author_number_of_awards
                }
                authors_data_dict_list.append(author_data_dict)
                visited_authors.add(author_href)
            # Scrape author end

            # scrape publisher
            ...
            # scrape publisher end

            # Books already stored are skipped only after the author step above,
            # presumably so that authors missing from the authors file still get scraped.
            if book_url in visited_books:
                continue

            # Create a dictionary for authors: the first author uses the bare keys
            # 'author' / 'author_href', later ones get a numeric suffix (1, 2, ...).
            # Distinct loop names keep `author_href` above from being clobbered.
            authors = {}
            for index, (name, href) in enumerate(zip(authors_names, authors_hrefs)):
                number = index if index > 0 else ''
                authors[f'author{number}'] = name
                authors[f'author_href{number}'] = href

            # Extract book details
            # NOTE(review): pages_html is looked up but not used below — confirm whether
            # the page count was meant to be stored.
            pages_html = soup_book.find('span', class_='d-sm-inline-block book-pages book__pages pr-2 mr-2 pr-sm-3 mr-sm-3')
            description_html = soup_book.find('div', class_='collapse-content')
            description = description_html.text if description_html else ''

            # Extract user statistics. Counters start at zero for every book so that an
            # unexpected amount of numbers can never leak the previous book's values.
            number_of_user_opinions = 0
            number_of_user_ratings = 0
            number_of_discussions = 0
            user_stats_html = soup_book.find('div', class_='d-flex flex-wrap justify-content-around px-3')
            if user_stats_html is not None:
                user_stats = list(map(int, re.findall(NUMBER_PATTERN, user_stats_html.text)))
                if len(user_stats) == 2:
                    number_of_user_opinions, number_of_user_ratings = user_stats
                elif len(user_stats) == 3:
                    number_of_user_opinions, number_of_user_ratings, number_of_discussions = user_stats

            # Extract additional book details: <dt> labels (trailing ':' removed)
            # paired positionally with <dd> values
            details_dict = dict(zip(
                [element.text.strip().rstrip(':') for element in soup_book.find_all('dt')],
                [element.text.strip() for element in soup_book.find_all('dd')]
            ))

            # Extract on-the-shelf statistics; items without a label match are skipped
            # instead of raising AttributeError on a None match
            on_the_shelf_dict_raw = {}
            for element in soup_book.find_all('li', class_='list-group-item p-0'):
                label_match = re.search(LITERAL_PATTERN, element.text)
                if label_match is None:
                    continue
                on_the_shelf_dict_raw[label_match.group().strip()] = "".join(re.findall(NUMBER_PATTERN, element.text))
            on_the_shelf_dict = {
                'number_of_people_read': on_the_shelf_dict_raw.get('Przeczytane', np.nan),
                'number_of_people_has': on_the_shelf_dict_raw.get('Posiadam', np.nan),
                'number_of_people_favorite': on_the_shelf_dict_raw.get('Ulubione', np.nan),
                'number_of_people_wants_to_read': on_the_shelf_dict_raw.get('Chcę przeczytać', np.nan),
                'number_of_people_wants_as_gift': on_the_shelf_dict_raw.get('Chcę w prezencie', np.nan),
                'number_of_people_currently_read': on_the_shelf_dict_raw.get('Teraz czytam', np.nan)
            }

            # Extract tags, joined into one '&'-separated string
            tags = '&'.join([element.text.strip() for element in soup_book.find_all('a', class_='tag')])

            # Extract ratings: one 'rating_<data-rating>' entry per chart button
            ratings_dict = {
                f'rating_{element["data-rating"]}': int("".join(re.findall(NUMBER_PATTERN, element.text.strip())))
                for element in soup_book.find_all('a', class_='chart-valuebtn btn-link--without-bold plusCountModal')
            }

            # Combine all extracted data into a single dictionary
            books_data_dict = {
                **authors,
                'description': description,
                'number_of_user_opinions': number_of_user_opinions,
                'number_of_user_ratings': number_of_user_ratings,
                'number_of_discussions': number_of_discussions,
                **details_dict,
                **on_the_shelf_dict,
                'tags': tags,
                **ratings_dict,
                'url': book_url
            }

            # Append the book data dictionary to the list and remember the URL so a
            # book listed on more than one page is stored only once
            books_data_dict_list.append(books_data_dict)
            visited_books.add(book_url)

            # Sleep for a random time between requests to avoid being blocked
            random_sleep_time = random.uniform(0.5, 1.5)
            time.sleep(random_sleep_time)
        # print current step
        print(f'Step: {step}')
except Exception as e:
    print(f'Error: {e}')
    print('something went wrong')
    # save data frames as tmp data frames to be merged later
    books_df_tmp = pd.DataFrame(books_data_dict_list)
    authors_df_tmp = pd.DataFrame(authors_data_dict_list)
    books_df_tmp.to_csv('books_tmp.csv', index=False)
    authors_df_tmp.to_csv('authors_tmp.csv', index=False)
    print('tmp data frames saved')


In [15]:
# Turn the per-book and per-author dictionaries collected above into data frames
# (one row per dictionary)
books_df = pd.DataFrame(books_data_dict_list)
authors_df = pd.DataFrame(authors_data_dict_list)

In [16]:
# Display the authors data frame
authors_df

Unnamed: 0,author_name,author_url,author_average_rating,author_number_of_people_read,author_number_of_people_wants_to_read,author_date_of_birth,author_number_of_fans,author_number_of_books_written,author_number_of_awards
0,J.K. Rowling,https://lubimyczytac.pl/autor/3701/j-k-rowling,7.6,130174,75668,1965-07-31,11479,40,1
1,Antoine de Saint-Exupéry,https://lubimyczytac.pl/autor/14276/antoine-de...,7.5,87296,8410,1900-06-29,681,42,0
2,J.R.R. Tolkien,https://lubimyczytac.pl/autor/3216/j-r-r-tolkien,8.2,95479,78821,1892-01-03,8866,97,0
3,George Orwell,https://lubimyczytac.pl/autor/10474/george-orwell,7.4,84640,44871,1903-06-25,2086,48,1
4,Michaił Bułhakow,https://lubimyczytac.pl/autor/22701/michail-bu...,7.3,66721,30123,1891-05-15,1344,48,0
5,Andrzej Sapkowski,https://lubimyczytac.pl/autor/3291/andrzej-sap...,7.6,90948,91278,1948-06-21,10281,67,0
6,Carlos Ruiz Zafón,https://lubimyczytac.pl/autor/24023/carlos-rui...,7.5,71370,78018,1964-09-25,6819,18,1
7,Stephenie Meyer,https://lubimyczytac.pl/autor/14755/stephenie-...,6.5,77442,31006,1973-12-24,2756,34,0
8,Suzanne Collins,https://lubimyczytac.pl/autor/6087/suzanne-col...,8.0,65485,33186,1962-08-10,3808,11,1
9,Stieg Larsson,https://lubimyczytac.pl/autor/19023/stieg-larsson,8.1,57785,37234,1954-08-15,3799,10,0


In [102]:
# Profile URL of the first author of the book stored under index label 0
author_href = books_df.loc[0, 'author_href']

In [185]:
# Scrape information about a single author (the page is fetched with retry)
author_html = get_with_retry(session, author_href)
# `authour_soup` keeps its original spelling so any cell that reads it still works
authour_soup = BeautifulSoup(author_html, SOUP.HTML_PARSER)
author_author_name = authour_soup.find('div', class_='author-main__header-wrapper').text
# The comma is swapped for a dot so float() can parse the rating
author_average_rating = float(authour_soup.find('div', class_='author-box').find('span', class_='rating__avarage').text.replace(',', '.'))
# Exactly two readers columns are expected: (read, wants to read);
# spaces inside the numbers are stripped before conversion
author_number_of_people_read, number_of_people_wants_to_read = [int(element.find('span').text.replace(' ', '')) for element in authour_soup.find_all('div', class_='author-box__readers-col')]
born_text = authour_soup.find('span', class_='author-info__born').text.split()[-1]
try:
    author_date_of_birth = pd.to_datetime(born_text, format='%d.%m.%Y')
except (ValueError, TypeError):
    # Same fallback as the main scraping loop: keep the raw text when it is
    # not a full dd.mm.yyyy date
    author_date_of_birth = born_text
author_number_of_fans = int(authour_soup.find('span', class_='author-box__number').text.replace(' ', ''))
author_number_of_books_written = int(authour_soup.find('div', class_='author-info__count').text)
author_awards_html = authour_soup.find('div', class_='author-info__count author-info__count--awards')
# A missing awards element is recorded as zero awards
author_number_of_awards = int(author_awards_html.text) if author_awards_html is not None else 0

In [186]:
# Show the award count parsed in the previous cell
author_number_of_awards

1

In [7]:
# NOTE(review): repeats the books_df assignment from an earlier cell and carries an
# out-of-order execution count — confirm whether this cell is still needed.
books_df = pd.DataFrame(books_data_dict_list)

In [None]:
# books_df.to_csv(BOOKS_PATH, index=False)