Skip to content
This repository has been archived by the owner on Jan 12, 2023. It is now read-only.

Commit

Permalink
Change mirrors and make it easier for end users to switch (#82)
Browse files Browse the repository at this point in the history
Fixes #81
  • Loading branch information
MasterOdin authored and c-w committed Apr 14, 2017
1 parent bc1edee commit 03596df
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 16 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
*.pyc
/virtualenv/
/venv/
/dist/
/build/
/*.egg-info
/.idea/
.coverage
Expand Down
13 changes: 13 additions & 0 deletions gutenberg/_util/decorators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""Module to deal with decorators"""


def execute_only_once(func):
    """Decorator that only lets ``func`` run the first time it is called.

    The first call is forwarded to ``func`` and its result is returned.
    Every later call is a no-op that returns ``None``. The wrapper is marked
    as called even when that first call raises, so a failing first call is
    not retried.

    The state lives on the returned wrapper's ``_called`` attribute.
    """
    import functools

    # functools.wraps keeps the decorated function's name and docstring
    # visible on the wrapper (useful for debugging and introspection).
    @functools.wraps(func)
    def call_once(*args, **kwargs):
        if not call_once._called:
            try:
                return func(*args, **kwargs)
            finally:
                # Runs on normal return and on exception alike.
                call_once._called = True

    call_once._called = False
    return call_once
51 changes: 35 additions & 16 deletions gutenberg/acquire/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,27 @@
from gutenberg._domain_model.exceptions import UnknownDownloadUriException
from gutenberg._domain_model.persistence import local_path
from gutenberg._domain_model.types import validate_etextno
from gutenberg._util.decorators import execute_only_once
from gutenberg._util.os import makedirs
from gutenberg._util.os import remove


# Path returned by local_path('text'); presumably where downloaded texts are
# cached -- its use is not visible in this excerpt, confirm against load_etext.
_TEXT_CACHE = local_path('text')
# Default mirror root, used when the caller does not supply a mirror.
_GUTENBERG_MIRROR = 'http://aleph.gutenberg.org'


def _etextno_to_uri_subdirectory(etextno):
"""
For example, ebook #1 is in subdirectory:
0/1
And ebook #19 is in subdirectory:
1/19
While ebook #15453 is in this subdirectory:
1/5/4/5/15453
Returns the subdirectory that an etextno will be found in a gutenberg mirror. Generally, one
finds the subdirectory by separating out each digit of the etext number, and uses it for
a directory. The exception here is for etext numbers less than 10, which are prepended with a
0 for the directory traversal.
>>> _etextno_to_uri_subdirectory(1)
'0/1'
>>> _etextno_to_uri_subdirectory(19)
'1/19'
>>> _etextno_to_uri_subdirectory(15453)
'1/5/4/5/15453'
"""
str_etextno = str(etextno).zfill(2)
all_but_last_digit = list(str_etextno[:-1])
Expand All @@ -37,15 +41,27 @@ def _etextno_to_uri_subdirectory(etextno):
return subdir


def _format_download_uri(etextno):
@execute_only_once
def _check_mirror_exists(mirror):
    """Verify that ``mirror`` answers a HEAD request successfully.

    Because of the ``execute_only_once`` decorator, the check is only
    performed on the first call.

    Raises:
        UnknownDownloadUriException: If the HEAD response is not ok.
    """
    if requests.head(mirror).ok:
        return

    message = (
        "Could not reach Gutenberg mirror '{0:s}'. Try setting a different mirror "
        "(https://www.gutenberg.org/MIRRORS.ALL) for --mirror flag or "
        "GUTENBERG_MIRROR environment variable."
    ).format(mirror)
    raise UnknownDownloadUriException(message)


def _format_download_uri(etextno, mirror=None):
"""Returns the download location on the Project Gutenberg servers for a
given text.
Raises:
UnknownDownloadUri: If no download location can be found for the text.
"""
uri_root = r'http://www.gutenberg.lib.md.us'
uri_root = mirror or _GUTENBERG_MIRROR
uri_root = uri_root.strip().rstrip('/')
_check_mirror_exists(uri_root)

extensions = ('.txt', '-8.txt', '-0.txt')
for extension in extensions:
path = _etextno_to_uri_subdirectory(etextno)
Expand All @@ -57,10 +73,10 @@ def _format_download_uri(etextno):
response = requests.head(uri)
if response.ok:
return uri
raise UnknownDownloadUriException
raise UnknownDownloadUriException('Failed to find {0} on {1}.'.format(etextno, uri_root))


def load_etext(etextno, refresh_cache=False):
def load_etext(etextno, refresh_cache=False, mirror=None):
"""Returns a unicode representation of the full body of a Project Gutenberg
text. After making an initial remote call to Project Gutenberg's servers,
the text is persisted locally.
Expand All @@ -73,7 +89,7 @@ def load_etext(etextno, refresh_cache=False):
remove(cached)
if not os.path.exists(cached):
makedirs(os.path.dirname(cached))
download_uri = _format_download_uri(etextno)
download_uri = _format_download_uri(etextno, mirror)
response = requests.get(download_uri)
text = response.text
with closing(gzip.open(cached, 'w')) as cache:
Expand All @@ -95,10 +111,13 @@ def _main():
parser = ArgumentParser(description='Download a Project Gutenberg text')
parser.add_argument('etextno', type=int)
parser.add_argument('outfile', type=FileType('w'))
parser.add_argument('--mirror', '-m', type=str)
args = parser.parse_args()

mirror = args.mirror or os.environ.get('GUTENBERG_MIRROR')

try:
text = load_etext(args.etextno)
text = load_etext(args.etextno, mirror=mirror)
with reopen_encoded(args.outfile, 'w', 'utf8') as outfile:
outfile.write(text)
except Error as error:
Expand Down
27 changes: 27 additions & 0 deletions tests/test_acquire.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,17 @@
from __future__ import absolute_import, unicode_literals
from builtins import str
import itertools
from collections import namedtuple

from gutenberg._domain_model.exceptions import UnknownDownloadUriException
from gutenberg._domain_model.vocabulary import DCTERMS
from gutenberg._domain_model.vocabulary import PGTERMS
from tests._sample_metadata import SampleMetaData
from tests._util import MockMetadataMixin
from tests._util import MockTextMixin
from tests._util import unittest

from gutenberg.acquire import text
from gutenberg.acquire import load_etext
from gutenberg.acquire import load_metadata

Expand Down Expand Up @@ -44,6 +47,30 @@ def test_load_etext(self):
self.assertIsInstance(text, str)
self.assertNotIn(u'\ufffd', text)

def test_invalid_etext(self):
    """load_etext with http://example.com as mirror must raise
    UnknownDownloadUriException."""
    self.assertRaises(
        UnknownDownloadUriException,
        text.load_etext, 1, mirror='http://example.com')


class TestFailLoadEtext(unittest.TestCase):
    """Checks load_etext failure handling with ``requests.head`` stubbed out."""

    def setUp(self):
        # Remember the real ``requests.head`` so tearDown can put it back.
        self._original_head = text.requests.head

    def tearDown(self):
        text.requests.head = self._original_head

    def request_head_response(self, ok=False):
        """Replace ``text.requests.head`` with a stub.

        The stub ignores its arguments and returns an object whose only
        field, ``ok``, has the value given here.
        """
        fake_response = namedtuple('Response', 'ok')

        def head(*_args, **_kwargs):
            return fake_response(ok)

        text.requests.head = head

    def test_unreachable_mirror(self):
        """A HEAD response that is not ok must surface as
        UnknownDownloadUriException."""
        self.request_head_response(ok=False)
        self.assertRaises(UnknownDownloadUriException, text.load_etext, 1)


if __name__ == '__main__':
unittest.main()

0 comments on commit 03596df

Please sign in to comment.