In [161]:
import json
import re
import time
import random

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

from utils.paths import URL_CONFIG_PATH
from utils.enums import URLS, SOUP

from scraping.pages import scrape_books
from scraping.session import get_session

In [8]:
# Load the URL templates used by the scraper from the JSON config file.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of relying on the locale's default codec.
with open(URL_CONFIG_PATH, encoding='utf-8') as url_config_file:
    urls = json.load(url_config_file)

In [149]:
# Session object shared by every request in this notebook: it is passed to
# scrape_books() and used directly via session.get() when fetching book pages.
# NOTE(review): presumably a requests-style session — confirm in scraping.session.
session = get_session()

In [150]:
# Matches a run of one or more digits; used to pull counts out of page text.
number_pattern = re.compile(r'\d+')
# Matches a run of one or more non-digit characters; used to pull the textual
# label that accompanies a count.
literal_pattern = re.compile(r'\D+')

In [151]:
# Collect the book URLs listed on the first listing page (step=1).
listing_base_url = urls[URLS.BASE]
first_page_url = urls[URLS.PAGE].format(step=1)
books_urls = scrape_books(
    base_url=listing_base_url,
    page_url=first_page_url,
    session=session,
)

In [152]:
# Show the URLs returned for the first listing page as a sanity check.
books_urls

['https://lubimyczytac.pl/ksiazka/5103137/harry-potter-i-kamien-filozoficzny',
 'https://lubimyczytac.pl/ksiazka/5103141/harry-potter-i-komnata-tajemnic',
 'https://lubimyczytac.pl/ksiazka/5103140/harry-potter-i-wiezien-azkabanu',
 'https://lubimyczytac.pl/ksiazka/5103135/harry-potter-i-czara-ognia',
 'https://lubimyczytac.pl/ksiazka/5171240/maly-ksiaze',
 'https://lubimyczytac.pl/ksiazka/10829/harry-potter-i-zakon-feniksa',
 'https://lubimyczytac.pl/ksiazka/5103138/harry-potter-i-ksiaze-polkrwi',
 'https://lubimyczytac.pl/ksiazka/5103142/harry-potter-i-insygnia-smierci',
 'https://lubimyczytac.pl/ksiazka/5121473/hobbit-czyli-tam-i-z-powrotem',
 'https://lubimyczytac.pl/ksiazka/5131218/1984',
 'https://lubimyczytac.pl/ksiazka/5166174/mistrz-i-malgorzata',
 'https://lubimyczytac.pl/ksiazka/240310/ostatnie-zyczenie',
 'https://lubimyczytac.pl/ksiazka/4802748/cien-wiatru',
 'https://lubimyczytac.pl/ksiazka/5090872/zmierzch-wydanie-specjalne',
 'https://lubimyczytac.pl/ksiazka/5133188/igrz

In [176]:
# --- Scrape settings ---------------------------------------------------------
FIRST_PAGE = 1     # first listing page to visit
LAST_PAGE = 9      # last listing page to visit (inclusive)
MIN_DELAY_S = 0.5  # lower bound of the random pause after each book request
MAX_DELAY_S = 1.5  # upper bound of the random pause after each book request

# Shelf label as it appears in the page's shelf list -> output column name.
SHELF_COLUMNS = {
    'Przeczytane': 'number_of_people_read',
    'Posiadam': 'number_of_people_has',
    'Ulubione': 'number_of_people_favorite',
    'Chcę przeczytać': 'number_of_people_wants_to_read',
    'Chcę w prezencie': 'number_of_people_wants_as_gift',
    'Teraz czytam': 'number_of_people_currently_read',
}


def _parse_user_stats(stats_text):
    """Return ``(opinions, ratings, discussions)`` parsed from the stats text.

    Args:
        stats_text: Text of the user-statistics block of a book page.

    Returns:
        A 3-tuple. When the text contains two numbers they are taken as
        opinions and ratings and discussions is 0; when it contains three
        they are taken in order. Any other count yields three NaNs, so a
        book with an unexpected layout never inherits the previous book's
        values.
    """
    counts = [int(chunk) for chunk in number_pattern.findall(stats_text)]
    if len(counts) == 2:
        opinions, ratings = counts
        return opinions, ratings, 0
    if len(counts) == 3:
        return tuple(counts)
    return np.nan, np.nan, np.nan


def _parse_book_page(book_html):
    """Build one flat record (dict) from the HTML of a single book page.

    Args:
        book_html: HTML source of the book page as a string.

    Returns:
        A dict with author names/links, description, user statistics, the
        page's ``dt``/``dd`` detail pairs, shelf counts, tags and per-rating
        vote counts. Elements missing from the page yield NaN values.
    """
    soup_book = BeautifulSoup(book_html, SOUP.HTML_PARSER)

    # Author name and link to the author page. The first author gets the
    # un-numbered keys ('author', 'author_href'); further ones get 1, 2, ...
    authors = {}
    for index, author in enumerate(soup_book.find_all('a', class_='link-name d-inline-block')):
        number = index if index > 0 else ''
        authors[f'author{number}'] = author.text
        authors[f'author_href{number}'] = author['href']

    # find() returns None when the element is absent, so guard before .text.
    description_html = soup_book.find('div', class_='collapse-content')
    description = description_html.text if description_html is not None else np.nan

    user_stats_html = soup_book.find('div', class_='d-flex flex-wrap justify-content-around px-3')
    stats_text = user_stats_html.text if user_stats_html is not None else ''
    number_of_user_opinions, number_of_user_ratings, number_of_discussions = _parse_user_stats(stats_text)

    # Detail table: each <dt> label (trailing ':' dropped) paired with the
    # <dd> value at the same position.
    detail_names = [element.text.strip().rstrip(':') for element in soup_book.find_all('dt')]
    detail_values = [element.text.strip() for element in soup_book.find_all('dd')]
    details_dict = dict(zip(detail_names, detail_values))

    # Shelf list items: the leading non-digit text is the label, all digits
    # in the item joined together form the count (kept as a string).
    on_the_shelf_dict_raw = {
        re.search(literal_pattern, element.text).group().strip(): "".join(re.findall(number_pattern, element.text))
        for element in soup_book.find_all('li', class_='list-group-item p-0')
    }
    on_the_shelf_dict = {
        column: on_the_shelf_dict_raw.get(label, np.nan)
        for label, column in SHELF_COLUMNS.items()
    }

    tags = '&'.join([element.text.strip() for element in soup_book.find_all('a', class_='tag')])

    # One column per rating value, named after the element's data-rating attribute.
    ratings_dict = {
        f'rating_{element["data-rating"]}': int("".join(re.findall(number_pattern, element.text.strip())))
        for element in soup_book.find_all('a', class_='chart-valuebtn btn-link--without-bold plusCountModal')
    }

    return {
        **authors,
        'description': description,
        'number_of_user_opinions': number_of_user_opinions,
        'number_of_user_ratings': number_of_user_ratings,
        'number_of_discussions': number_of_discussions,
        **details_dict,
        **on_the_shelf_dict,
        'tags': tags,
        **ratings_dict,
    }


# Visit every listing page, fetch each listed book page and collect one record
# per book. The list is reset here so re-running the cell does not duplicate rows.
books_data_dict_list = []
for step in range(FIRST_PAGE, LAST_PAGE + 1):
    books_urls = scrape_books(base_url=urls[URLS.BASE], page_url=urls[URLS.PAGE].format(step=step), session=session)
    for book_url in books_urls:
        book_html = session.get(book_url).text
        books_data_dict_list.append(_parse_book_page(book_html))
        # Random pause between book requests to avoid hammering the site.
        time.sleep(random.uniform(MIN_DELAY_S, MAX_DELAY_S))

In [179]:
# Build the table in one step from the list of per-book dicts: one row per
# book, one column per key seen in any record.
books_df = pd.DataFrame(books_data_dict_list)

In [180]:
# Persist the scraped prototype dataset; index=False leaves the row index out
# of the file. The path is relative to the notebook's working directory.
books_df.to_csv('books_data_prototype.csv', index=False)

In [181]:
# Display the resulting frame to inspect the scraped columns.
books_df

Unnamed: 0,author,author_href,description,number_of_user_opinions,number_of_user_ratings,number_of_discussions,Kategoria,Format,Cykl,Tytuł oryginału,...,rating_4,rating_3,rating_2,rating_1,Inne,author1,author_href1,Seria,author2,author_href2
0,J.K. Rowling,https://lubimyczytac.pl/autor/3701/j-k-rowling,"Pierwszy tom cyklu ""Harry Potter"" w poważnej,...",5156,85006,1,literatura młodzieżowa,papier,Harry Potter (tom 1),Harry Potter and the Philosopher's Stone,...,501,490,67,145,,,,,,
1,J.K. Rowling,https://lubimyczytac.pl/autor/3701/j-k-rowling,"Drugi tom bestsellerowego cyklu w poważnej, ""...",2305,70814,0,literatura młodzieżowa,papier,Harry Potter (tom 2),Harry Potter and the Chamber of Secrets,...,345,404,26,67,,,,,,
2,J.K. Rowling,https://lubimyczytac.pl/autor/3701/j-k-rowling,"Trzeci tom bestsellerowego cyklu w poważnej, ...",2240,67913,0,literatura młodzieżowa,papier,Harry Potter (tom 3),Harry Potter and the prisoner of Azkaban,...,200,235,20,46,,,,,,
3,J.K. Rowling,https://lubimyczytac.pl/autor/3701/j-k-rowling,"Czwarty tom bestsellerowego cyklu w poważnej,...",1892,64433,0,literatura młodzieżowa,papier,Harry Potter (tom 4),Harry Potter and the Goblet of Fire,...,244,274,27,55,,,,,,
4,Antoine de Saint-Exupéry,https://lubimyczytac.pl/autor/14276/antoine-de...,Ekskluzywne wydanie z barwionymi brzegami.Poe...,4264,63435,0,literatura piękna,papier,,Le Petit Prince,...,1039,1139,335,428,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,J.K. Rowling,https://lubimyczytac.pl/autor/3701/j-k-rowling,"„Baśnie barda Beedle’a” to zbiór pełen magii,...",902,10789,0,literatura młodzieżowa,papier,,The Tales of Beedle the Bard,...,147,121,15,14,,,,Harry Potter,,
266,Rick Riordan,https://lubimyczytac.pl/autor/25135/rick-riordan,"W porywającym, dowcipnym i cieszącym się ogro...",628,10821,0,literatura młodzieżowa,papier,Percy Jackson i Bogowie Olimpijscy (tom 2),The Sea of Monsters,...,63,30,11,6,,,,,,
267,Molier,https://lubimyczytac.pl/autor/22970/molier,Klasyka literatury polskiej i światowej w sta...,311,10632,0,"utwór dramatyczny (dramat, komedia, tragedia)",papier,,Tartuffe,...,517,587,66,128,,,,Klasyka Literatury SBM [bez opracowania],,
268,Sarah J. Maas,https://lubimyczytac.pl/autor/74122/sarah-j-maas,Między światłem a ciemnością rozgrywa się wal...,1154,11091,0,romantasy,papier,Dwór cierni i róż (tom 2),A Court of Mist and Fury,...,88,48,44,27,#0 Top 100 1 nagroda,,,,,
