In [148]:
import json
import re
import time
import random

import pandas as pd
from bs4 import BeautifulSoup
import requests

from utils.paths import URL_CONFIG_PATH
from utils.enums import URLS, SOUP

from scraping.pages import scrape_books
from scraping.session import get_session

In [8]:
# Load the URL settings used by the scraper from the JSON config file.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the file
# is decoded the same way regardless of the platform's locale default.
with open(URL_CONFIG_PATH, encoding='utf-8') as url_config_file:
    urls = json.load(url_config_file)

In [9]:
# Create the session object that the scraping calls below reuse for their requests.
session = get_session()

In [None]:
# Shared regexes used below to split scraped text into counts and labels.
number_pattern = re.compile(r'\d+')  # one or more consecutive digits
literal_pattern = re.compile(r'\D+')  # one or more consecutive non-digit characters

In [10]:
# Scrape the book URLs from the listing page obtained with step=1.
books_urls = scrape_books(
    base_url=urls[URLS.BASE],
    page_url=urls[URLS.PAGE].format(step=1),
    session=session,
)

In [11]:
# Display the scraped URLs for a quick visual check.
books_urls

['https://lubimyczytac.pl/ksiazka/5103137/harry-potter-i-kamien-filozoficzny',
 'https://lubimyczytac.pl/ksiazka/5103141/harry-potter-i-komnata-tajemnic',
 'https://lubimyczytac.pl/ksiazka/5103140/harry-potter-i-wiezien-azkabanu',
 'https://lubimyczytac.pl/ksiazka/5103135/harry-potter-i-czara-ognia',
 'https://lubimyczytac.pl/ksiazka/5171240/maly-ksiaze',
 'https://lubimyczytac.pl/ksiazka/10829/harry-potter-i-zakon-feniksa',
 'https://lubimyczytac.pl/ksiazka/5103138/harry-potter-i-ksiaze-polkrwi',
 'https://lubimyczytac.pl/ksiazka/5103142/harry-potter-i-insygnia-smierci',
 'https://lubimyczytac.pl/ksiazka/5121473/hobbit-czyli-tam-i-z-powrotem',
 'https://lubimyczytac.pl/ksiazka/5131218/1984',
 'https://lubimyczytac.pl/ksiazka/5166174/mistrz-i-malgorzata',
 'https://lubimyczytac.pl/ksiazka/240310/ostatnie-zyczenie',
 'https://lubimyczytac.pl/ksiazka/4802748/cien-wiatru',
 'https://lubimyczytac.pl/ksiazka/5090872/zmierzch-wydanie-specjalne',
 'https://lubimyczytac.pl/ksiazka/5133188/igrz

In [13]:
# Take the first scraped URL as a sample for exploring a single book page.
first_book = books_urls[0]

In [16]:
# Download the sample book page and keep its body as text.
# NOTE(review): the response status is not checked here, so an error page
# would be stored as if it were a book page.
book_html = session.get(first_book).text

In [138]:
def _digits_to_int(text):
    """Join every digit run found in ``text`` into a single int.

    All digit runs are concatenated (rather than taking only the first one) so
    a number whose digit groups are separated by non-digit characters is read
    as one value. Returns None when ``text`` contains no digits at all.
    """
    digits = ''.join(number_pattern.findall(text))
    return int(digits) if digits else None


# Scrape every book page and collect one flat record (dict) per book.
books_data_dict_list = []
for book_url in books_urls:
    response = session.get(book_url)
    # Fail loudly on an HTTP error instead of parsing an error page as a book.
    response.raise_for_status()
    book_html = response.text
    soup_book = BeautifulSoup(book_html, SOUP.HTML_PARSER)

    # Author names and hrefs to the author pages. The first author is stored
    # under 'author' / 'author_href'; further ones get a numeric suffix
    # ('author1', 'author_href1', ...).
    authors_html = soup_book.find_all('a', class_='link-name d-inline-block')
    authors_names = [author.text for author in authors_html]
    authors_hrefs = [author['href'] for author in authors_html]
    authors = {}
    for index, (author_name, author_href) in enumerate(zip(authors_names, authors_hrefs)):
        number = index if index > 0 else ''
        authors[f'author{number}'] = author_name
        authors[f'author_href{number}'] = author_href

    # Page count: first digit run inside the pages <span>. find() returns None
    # when the element is absent, so a missing page count is recorded as None
    # instead of aborting the whole scrape.
    pages_html = soup_book.find('span', class_='d-sm-inline-block book-pages book__pages pr-2 mr-2 pr-sm-3 mr-sm-3')
    pages_match = number_pattern.search(pages_html.text) if pages_html is not None else None
    pages = int(pages_match.group()) if pages_match is not None else None

    # Description text; None when the page has no description block.
    description_html = soup_book.find('div', class_='collapse-content')
    description = description_html.text if description_html is not None else None

    # Community counters. Exactly three numbers are expected in this block
    # (opinions, ratings, discussions); any other count raises ValueError.
    user_stats_html = soup_book.find('div', class_='d-flex flex-wrap justify-content-around px-3')
    user_stats = user_stats_html.text
    number_of_user_opinions, number_of_user_ratings, number_of_discussions = map(int, number_pattern.findall(user_stats))

    # Details list: every <dt> label paired positionally with a <dd> value.
    detitails_dict = dict(zip(
        [element.text.strip() for element in soup_book.find_all('dt')],
        [element.text.strip() for element in soup_book.find_all('dd')],
    ))

    # Shelf statistics: the non-digit part of each list item is the label, the
    # digits are the count. Counts are converted to int so they have the same
    # type as the rating counts below.
    on_the_shelf_dict_raw = {
        literal_pattern.search(element.text).group().strip(): _digits_to_int(element.text)
        for element in soup_book.find_all('li', class_='list-group-item p-0')
    }
    on_the_shelf_dict = {
        'number_of_people_read': on_the_shelf_dict_raw['Przeczytane'],
        'number_of_people_has': on_the_shelf_dict_raw['Posiadam'],
        'number_of_people_favorite': on_the_shelf_dict_raw['Ulubione'],
        'number_of_people_wants_to_read': on_the_shelf_dict_raw['Chcę przeczytać'],
        'number_of_people_wants_as_gift': on_the_shelf_dict_raw['Chcę w prezencie'],
        'number_of_people_currently_read': on_the_shelf_dict_raw['Teraz czytam']
    }

    # All tag labels joined into a single '&'-separated string.
    tags = '&'.join([element.text.strip() for element in soup_book.find_all('a', class_='tag')])

    # Rating histogram: one 'rating_<n>' entry per chart button, keyed by the
    # element's data-rating attribute.
    ratings_dict = {
        f'rating_{element["data-rating"]}': _digits_to_int(element.text)
        for element in soup_book.find_all('a', class_='chart-valuebtn btn-link--without-bold plusCountModal')
    }

    books_data_dict = {
        **authors,
        'pages': pages,
        'description': description,
        'number_of_user_opinions': number_of_user_opinions,
        'number_of_user_ratings': number_of_user_ratings,
        'number_of_discussions': number_of_discussions,
        **detitails_dict,
        **on_the_shelf_dict,
        'tags': tags,
        **ratings_dict
    }
    books_data_dict_list.append(books_data_dict)

    # Random pause between requests to avoid hitting the site at a fixed rate.
    random_sleep_time = random.uniform(0.5, 1.5)
    time.sleep(random_sleep_time)

In [145]:
# Preview the record built by the most recent loop iteration as a one-row frame
# (books_data_dict_list holds the records collected for all books).
pd.DataFrame([books_data_dict])

Unnamed: 0,author,author_href,pages,description,number_of_user_opinions,number_of_user_ratings,number_of_discussions,Kategoria:,Format:,Cykl:,...,rating_10,rating_9,rating_8,rating_7,rating_6,rating_5,rating_4,rating_3,rating_2,rating_1
0,J.K. Rowling,https://lubimyczytac.pl/autor/3701/j-k-rowling,320,"Pierwszy tom cyklu ""Harry Potter"" w poważnej,...",5155,85005,1,literatura młodzieżowa,papier,Harry Potter (tom 1),...,20734,15550,21000,17708,6098,2712,501,490,67,145
