In [305]:
import os
import pickle
import urllib.request
from collections import OrderedDict
from time import sleep
from urllib.parse import urljoin

import bs4
import mechanicalsoup
from tqdm import tqdm

# from tqdm.auto import trange

# import sys

# Example of a single book detail page (kept for manual testing):
# TEST_BOOK_PAGE = 'https://link.springer.com/book/10.1007/978-1-4612-4360-1'

# Site root that hrefs scraped from the pages are resolved against.
BASE_URL = "https://link.springer.com/"

# First page of the search results (English, books, open-access package).
START_URL = (
    "https://link.springer.com/search"
    '?facet-language="En"'
    '&facet-content-type="Book"'
    "&package=openaccess"
    "&showAll=false"
)

# Shape of the second result page, from which the template below was derived:
# SECOND_URL = 'https://link.springer.com/search/page/2?package=openaccess&showAll=false&facet-language="En"&facet-content-type="Book"'

# Template for result page N; the `{}` placeholder receives the page number.
UNKNOWN_URL = (
    "https://link.springer.com/search/page/{}"
    "?package=openaccess"
    "&showAll=false"
    '&facet-language="En"'
    '&facet-content-type="Book"'
)

# Presumably the number of result pages to walk (the crawl stops at page 46).
MAX_RANGE = 46

# CSS selectors used on a book detail page.
TITLE_SELECTOR_IN_BOOK_INFO = (
    "#main-content"
    " > article.main-wrapper.main-wrapper--no-gradient.main-wrapper--dual-main"
    " > div > div"
    " > div.main-body__content"
    " > div > div"
    " > div:nth-child(1)"
    " > div.page-title"
    " > h1"
)
PDF_LINK_SELECTOR_IN_BOOK_INFO = (
    "#main-content"
    " > article.main-wrapper.main-wrapper--no-gradient.main-wrapper--dual-main"
    " > div > div"
    " > div.cta-button-container.cta-button-container--stacked.u-pt-36"
    " > div > div"
    " > a"
)
EPUB_LINK_SELECTOR_IN_BOOK_INFO = (
    "#main-content"
    " > article.main-wrapper.main-wrapper--no-gradient.main-wrapper--dual-main"
    " > div > div"
    " > div.cta-button-container.cta-button-container--inline"
    ".cta-button-container--stacked.u-pt-36"
    ".test-download-book-separate-buttons"
    " > div:nth-child(2)"
    " > a"
)


# CSS selectors used on a search-result page to find the pagination link.
# The first variant matches any anchor in the bottom bar; the second only
# matches the anchor carrying the `next` class.
FIRST_NEXT_PAGE_SELECTOR = "#kb-nav--main > div.functions-bar.functions-bar-bottom > form > a"
NEXT_PAGE_SELECTOR = FIRST_NEXT_PAGE_SELECTOR + ".next"


class DownloadProgressBar(tqdm):
    """tqdm bar whose `update_to` can be used as a `urlretrieve` reporthook."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Move the bar to the absolute position `b * bsize`.

        :param b: number of blocks transferred so far
        :param bsize: size of one block
        :param tsize: total size; when given it replaces the bar's total
        """
        if tsize is not None:
            self.total = tsize

        # The hook reports an absolute position while tqdm.update() expects
        # an increment, so subtract what the bar has already counted.
        transferred = b * bsize
        self.update(transferred - self.n)


class Scraper:
    """Crawl the Springer search results and download the listed books.

    Intended call order (each step fills state used by the next one):

    1. ``collect_detail_page_urls()`` fills ``detail_page_urls``.
    2. ``collect_pdf_file_urls()`` fills ``file_detail_d``
       (output path -> download URL; EPUB is preferred over PDF).
    3. ``download_pdfs()`` writes the files to disk.

    Every page request and every download is followed/preceded by a pause of
    ``sleep_time`` seconds.
    """

    def __init__(
        self, url, data_directory="./../pdfs/", sleep_time=10, max_pages=None
    ):
        """Create a scraper.

        :param url: first search-result page
        :param data_directory: directory the downloaded files are written to
        :param sleep_time: pause in seconds around each request
        :param max_pages: number of the last result page to visit;
            ``None`` (default) means ``MAX_RANGE``
        """
        self.start_url = url
        self.detail_page_urls = []
        self.pdf_urls = []
        self.top_page_urls = [url]
        self.data_directory = data_directory
        self.browser = mechanicalsoup.StatefulBrowser()
        self.file_detail_d = OrderedDict()
        self.sleep_time = sleep_time
        self.debug = ""
        self.count = 1
        # Resolved here (not as a default argument) so the module constant is
        # only looked up when a scraper is actually created.
        self.max_pages = MAX_RANGE if max_pages is None else max_pages

    def collect_detail_page_urls(self, url=""):
        """Walk the result pages and gather every book detail-page URL.

        Starts at ``url`` (or ``start_url`` when empty) and then visits the
        numbered pages built from ``UNKNOWN_URL`` until ``self.count`` reaches
        ``self.max_pages``. The running number of collected URLs is printed
        before each page is fetched.

        :param url: page to start from; empty means ``self.start_url``
        """
        if not url:
            url = self.start_url

        # Iterative on purpose: one stack frame per result page is not needed.
        while True:
            print(len(self.detail_page_urls))
            result_list_page_bs = self.get_page_information(url)
            self.detail_page_urls += self.get_detail_page_urls(
                result_list_page_bs
            )

            if self.count >= self.max_pages:
                break

            self.count += 1
            url = UNKNOWN_URL.format(self.count)
            self.top_page_urls.append(url)

    def collect_pdf_file_urls(self):
        """Visit every detail page and record output path -> download URL.

        The EPUB link is used when the page offers one, the PDF link
        otherwise. A page without a title or without any download link is
        reported and skipped instead of aborting the whole crawl.
        """
        for detail_page_link in tqdm(self.detail_page_urls):
            detail_page_bs = self.get_page_information(detail_page_link)

            try:
                if detail_page_bs.select(EPUB_LINK_SELECTOR_IN_BOOK_INFO):
                    output_path = self.get_epub_name(detail_page_bs)
                    file_url = self.get_epub_url_link(detail_page_bs)
                else:
                    output_path = self.get_pdf_name(detail_page_bs)
                    file_url = self.get_pdf_url_link(detail_page_bs)
            except IndexError:
                # A selector matched nothing (layout differs / no download).
                tqdm.write(
                    "Skipped (no title or download link found): {}".format(
                        detail_page_link
                    )
                )
                continue

            # NOTE: two books with the same title share one key; the later
            # one replaces the earlier one.
            self.file_detail_d[output_path] = file_url

    def download_pdfs(self):
        """Download every file recorded in ``file_detail_d``.

        Sleeps ``sleep_time`` seconds before each download.
        """
        for output_path, url in tqdm(
            self.file_detail_d.items(), desc="file-loop"
        ):
            sleep(self.sleep_time)
            self.download_url(url, output_path)

    def download_url(self, url, output_path):
        """Download ``url`` to ``output_path`` while showing a progress bar.

        :param url:
        https://~.pdf
        # https://link.springer.com/book/10.1007/978-1-4612-4360-1/content/pdf/10.1007%2F978-1-4612-4360-1.pdf'

        :param output_path:
        path of the file to write; its directory is created when missing
        """
        target_directory = os.path.dirname(output_path)

        if target_directory:
            os.makedirs(target_directory, exist_ok=True)

        with DownloadProgressBar(
            unit="B", unit_scale=True, miniters=1, desc=url.split("/")[-1]
        ) as t:
            urllib.request.urlretrieve(
                url, filename=output_path, reporthook=t.update_to
            )

    def get_page_information(self, url):
        """Open ``url`` in the browser, pause, and return the parsed page."""
        self.browser.open(url)
        sleep(self.sleep_time)

        return self.browser.get_current_page()

    def get_next_page_link(self, result_list_page_bs: "bs4.BeautifulSoup"):
        """Return the absolute URL of the next result page.

        While only the start page is known (``top_page_urls`` has one entry)
        the first anchor of the bottom bar is inspected and accepted only if
        its first class is ``next``; afterwards the ``a.next`` anchor is used.

        :return: the URL, or ``False`` when the page has no next link
        """
        on_start_page = len(self.top_page_urls) == 1
        selector = (
            FIRST_NEXT_PAGE_SELECTOR if on_start_page else NEXT_PAGE_SELECTOR
        )
        # select() returns an empty list (it does not raise) when nothing
        # matches, so emptiness has to be checked explicitly.
        matches = result_list_page_bs.select(selector)

        if not matches:
            print("Maybe Finished")

            return False

        link = matches[0]

        if on_start_page:  # Is it start case?
            classes = link.get("class") or []

            if not classes or classes[0] != "next":
                return False

        return urljoin(BASE_URL, link["href"])

    def get_detail_page_urls(self, result_list_page_bs):
        """Return the absolute detail-page URLs listed on a result page.

        :return: list of URLs; empty when the page has no ``#results-list``
        """
        result_lists = result_list_page_bs.select("#results-list")

        if not result_lists:
            return []

        return [
            urljoin(BASE_URL, anchor["href"])
            for anchor in result_lists[0].find_all("a", {"class": "title"})
        ]

    def get_pdf_url_link(self, detail_page_html_bs):
        """Return the absolute PDF URL; IndexError when the page has none."""
        return self._select_href(
            detail_page_html_bs, PDF_LINK_SELECTOR_IN_BOOK_INFO
        )

    def get_epub_url_link(self, detail_page_html_bs):
        """Return the absolute EPUB URL; IndexError when the page has none."""
        return self._select_href(
            detail_page_html_bs, EPUB_LINK_SELECTOR_IN_BOOK_INFO
        )

    def get_pdf_name(self, detail_page_html_bs):
        """Return the output path for the PDF of the book on this page."""
        return self._build_file_name(detail_page_html_bs, ".pdf")

    def get_epub_name(self, detail_page_html_bs):
        """Return the output path for the EPUB of the book on this page."""
        return self._build_file_name(detail_page_html_bs, ".epub")

    def _select_href(self, detail_page_html_bs, selector):
        """Resolve the href of the first element matching ``selector``.

        urljoin is used instead of string concatenation so a root-relative
        href ("/content/...") does not yield a double slash after the host.
        Raises IndexError when nothing matches.
        """
        return urljoin(
            BASE_URL, detail_page_html_bs.select(selector)[0]["href"]
        )

    def _build_file_name(self, detail_page_html_bs, extension):
        """Build ``<data_directory>/<title><extension>`` from the page title.

        Spaces become underscores (as before); surrounding whitespace is
        dropped and path separators inside the title are replaced so the
        title cannot point into another directory.
        Raises IndexError when the page has no title element.
        """
        title = detail_page_html_bs.select(TITLE_SELECTOR_IN_BOOK_INFO)[0].text
        file_stem = title.strip().replace(" ", "_")

        for separator in ("/", "\\"):
            file_stem = file_stem.replace(separator, "_")

        return os.path.join(self.data_directory, file_stem + extension)

In [306]:
# Create the scraper rooted at the first search-result page, then walk the
# result pages to fill scraper.detail_page_urls. Each page request is followed
# by a pause (sleep_time, 10 s by default), so a full run takes a while.
scraper = Scraper(url=START_URL)
scraper.collect_detail_page_urls()


0
20


KeyboardInterrupt: 

In [307]:
# Visit every collected detail page and record, in scraper.file_detail_d,
# the output file path and the EPUB (preferred) or PDF download URL.
scraper.collect_pdf_file_urls()

 25%|██▌       | 5/20 [00:57<02:51, 11.42s/it]


KeyboardInterrupt: 

In [309]:
# Download every file recorded in scraper.file_detail_d, pausing before each
# download and showing one progress bar per file.
scraper.download_pdfs()

file-loop:   0%|          | 0/5 [00:00<?, ?it/s]
10.1007%2F978-1-4842-4932-1.epub: 0.00B [00:00, ?B/s][A
10.1007%2F978-1-4842-4932-1.epub:   0%|          | 8.19k/6.76M [00:02<34:06, 3.30kB/s][A
10.1007%2F978-1-4842-4932-1.epub:   1%|          | 49.2k/6.76M [00:02<23:48, 4.69kB/s][A
10.1007%2F978-1-4842-4932-1.epub:   1%|▏         | 90.1k/6.76M [00:02<16:39, 6.67kB/s][A
10.1007%2F978-1-4842-4932-1.epub:   2%|▏         | 139k/6.76M [00:02<11:38, 9.47kB/s] [A
10.1007%2F978-1-4842-4932-1.epub:   3%|▎         | 197k/6.76M [00:02<08:08, 13.4kB/s][A
10.1007%2F978-1-4842-4932-1.epub:   4%|▎         | 238k/6.76M [00:03<05:45, 18.9kB/s][A
10.1007%2F978-1-4842-4932-1.epub:   4%|▍         | 287k/6.76M [00:03<04:03, 26.5kB/s][A
10.1007%2F978-1-4842-4932-1.epub:   5%|▍         | 336k/6.76M [00:03<02:53, 36.9kB/s][A
10.1007%2F978-1-4842-4932-1.epub:   6%|▌         | 401k/6.76M [00:03<02:04, 51.1kB/s][A
10.1007%2F978-1-4842-4932-1.epub:   7%|▋         | 467k/6.76M [00:03<01:29, 70.4kB/s][A


KeyboardInterrupt: 