From 03596df5783222703b38db4027f5a75aa3b09ff5 Mon Sep 17 00:00:00 2001
From: Matthew Peveler
Date: Fri, 14 Apr 2017 02:52:23 -0400
Subject: [PATCH] Change mirrors and make it easier for end users to switch (#82)

Fixes #81
---
 .gitignore                    |  2 ++
 gutenberg/_util/decorators.py | 13 +++++++++
 gutenberg/acquire/text.py     | 51 ++++++++++++++++++++++++-----------
 tests/test_acquire.py         | 27 +++++++++++++++++++
 4 files changed, 77 insertions(+), 16 deletions(-)
 create mode 100644 gutenberg/_util/decorators.py

diff --git a/.gitignore b/.gitignore
index 0169fb3..9a00f5e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
 *.pyc
 /virtualenv/
+/venv/
 /dist/
+/build/
 /*.egg-info
 /.idea/
 .coverage
diff --git a/gutenberg/_util/decorators.py b/gutenberg/_util/decorators.py
new file mode 100644
index 0000000..c86d856
--- /dev/null
+++ b/gutenberg/_util/decorators.py
@@ -0,0 +1,13 @@
+"""Module to deal with decorators"""
+
+
+def execute_only_once(func):
+    """Decorator that will only allow a function to be executed the first time it's called"""
+    def call_once(*args, **kwargs):
+        if not call_once._called:
+            try:
+                return func(*args, **kwargs)
+            finally:
+                call_once._called = True
+    call_once._called = False
+    return call_once
diff --git a/gutenberg/acquire/text.py b/gutenberg/acquire/text.py
index 2805d02..9720889 100644
--- a/gutenberg/acquire/text.py
+++ b/gutenberg/acquire/text.py
@@ -12,23 +12,27 @@ from gutenberg._domain_model.exceptions import UnknownDownloadUriException
 from gutenberg._domain_model.persistence import local_path
 from gutenberg._domain_model.types import validate_etextno
+from gutenberg._util.decorators import execute_only_once
 from gutenberg._util.os import makedirs
 from gutenberg._util.os import remove
 
 
-
 _TEXT_CACHE = local_path('text')
+_GUTENBERG_MIRROR = 'http://aleph.gutenberg.org'
 
 
 def _etextno_to_uri_subdirectory(etextno):
     """
-    For example, ebook #1 is in subdirectory:
-    0/1
-
-    And ebook #19 is in subdirectory:
-    1/19
-
-    While ebook #15453 is in this subdirectory:
-    1/5/4/5/15453
+    Returns the subdirectory that an etextno will be found in a gutenberg mirror. Generally, one
+    finds the subdirectory by separating out each digit of the etext number, and uses it for
+    a directory. The exception here is for etext numbers less than 10, which are prepended with a
+    0 for the directory traversal.
+
+    >>> _etextno_to_uri_subdirectory(1)
+    '0/1'
+    >>> _etextno_to_uri_subdirectory(19)
+    '1/19'
+    >>> _etextno_to_uri_subdirectory(15453)
+    '1/5/4/5/15453'
     """
     str_etextno = str(etextno).zfill(2)
     all_but_last_digit = list(str_etextno[:-1])
@@ -37,15 +41,27 @@ def _etextno_to_uri_subdirectory(etextno):
     return subdir
 
 
-def _format_download_uri(etextno):
+@execute_only_once
+def _check_mirror_exists(mirror):
+    response = requests.head(mirror)
+    if not response.ok:
+        error = "Could not reach Gutenberg mirror '{0:s}'. Try setting a different mirror " \
+                "(https://www.gutenberg.org/MIRRORS.ALL) for --mirror flag or " \
+                "GUTENBERG_MIRROR environment variable.".format(mirror)
+        raise UnknownDownloadUriException(error)
+
+
+def _format_download_uri(etextno, mirror=None):
     """Returns the download location on the Project Gutenberg servers for a
     given text.
 
     Raises:
         UnknownDownloadUri: If no download location can be found for the text.
-
     """
-    uri_root = r'http://www.gutenberg.lib.md.us'
+    uri_root = mirror or _GUTENBERG_MIRROR
+    uri_root = uri_root.strip().rstrip('/')
+    _check_mirror_exists(uri_root)
+
     extensions = ('.txt', '-8.txt', '-0.txt')
     for extension in extensions:
         path = _etextno_to_uri_subdirectory(etextno)
@@ -57,10 +73,10 @@ def _format_download_uri(etextno):
         response = requests.head(uri)
         if response.ok:
             return uri
-    raise UnknownDownloadUriException
+    raise UnknownDownloadUriException('Failed to find {0} on {1}.'.format(etextno, uri_root))
 
 
-def load_etext(etextno, refresh_cache=False):
+def load_etext(etextno, refresh_cache=False, mirror=None):
     """Returns a unicode representation of the full body of a Project Gutenberg
     text. After making an initial remote call to Project Gutenberg's servers, the
     text is persisted locally.
@@ -73,7 +89,7 @@ def load_etext(etextno, refresh_cache=False):
         remove(cached)
     if not os.path.exists(cached):
         makedirs(os.path.dirname(cached))
-        download_uri = _format_download_uri(etextno)
+        download_uri = _format_download_uri(etextno, mirror)
         response = requests.get(download_uri)
         text = response.text
         with closing(gzip.open(cached, 'w')) as cache:
@@ -95,10 +111,13 @@ def _main():
     parser = ArgumentParser(description='Download a Project Gutenberg text')
     parser.add_argument('etextno', type=int)
     parser.add_argument('outfile', type=FileType('w'))
+    parser.add_argument('--mirror', '-m', type=str)
     args = parser.parse_args()
 
+    mirror = args.mirror or os.environ.get('GUTENBERG_MIRROR')
+
     try:
-        text = load_etext(args.etextno)
+        text = load_etext(args.etextno, mirror=mirror)
         with reopen_encoded(args.outfile, 'w', 'utf8') as outfile:
             outfile.write(text)
     except Error as error:
diff --git a/tests/test_acquire.py b/tests/test_acquire.py
index 1af88c5..f422730 100644
--- a/tests/test_acquire.py
+++ b/tests/test_acquire.py
@@ -6,7 +6,9 @@
 from __future__ import absolute_import, unicode_literals
 from builtins import str
 import itertools
+from collections import namedtuple
 
+from gutenberg._domain_model.exceptions import UnknownDownloadUriException
 from gutenberg._domain_model.vocabulary import DCTERMS
 from gutenberg._domain_model.vocabulary import PGTERMS
 from tests._sample_metadata import SampleMetaData
@@ -14,6 +16,7 @@ from tests._util import MockTextMixin
 from tests._util import unittest
 
+from gutenberg.acquire import text
 from gutenberg.acquire import load_etext
 from gutenberg.acquire import load_metadata
 
 
@@ -44,6 +47,30 @@ def test_load_etext(self):
             self.assertIsInstance(text, str)
             self.assertNotIn(u'\ufffd', text)
 
+    def test_invalid_etext(self):
+        with self.assertRaises(UnknownDownloadUriException):
+            text.load_etext(1, mirror='http://example.com')
+
+
+class TestFailLoadEtext(unittest.TestCase):
+    def setUp(self):
+        self._original_head = text.requests.head
+
+    def tearDown(self):
+        text.requests.head = self._original_head
+
+    def request_head_response(self, ok=False):
+        response = namedtuple('Response', 'ok')
+
+        def head(*args, **kwargs):
+            return response(ok)
+        text.requests.head = head
+
+    def test_unreachable_mirror(self):
+        self.request_head_response(ok=False)
+        with self.assertRaises(UnknownDownloadUriException):
+            text.load_etext(1)
+
 
 if __name__ == '__main__':
     unittest.main()