Skip to content

Commit

Permalink
Merge pull request #17 from blancadesal/remove-wget
Browse files (browse the repository at this point in the history)
Use requests instead of wget to download files (Issue #5)
  • Loading branch information
geohci committed Dec 16, 2021
2 parents b906ab1 + 34c8b4f commit db674af
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 39 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
],
package_dir={"": "src"},
packages=find_packages(where="src"),
install_requires=["wget >= 3.2"],
install_requires=["requests >= 2.26", "tqdm >= 4.62"],
include_package_data=True,
zip_safe=False,
)
63 changes: 27 additions & 36 deletions src/mwsql/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,12 @@
"""

import gzip
import sys
from contextlib import contextmanager
from pathlib import Path
from typing import Iterator, Optional, TextIO, Union
from urllib.error import HTTPError

import wget # type: ignore
import requests # type: ignore
from tqdm import tqdm # type: ignore

# Custom type
PathObject = Union[str, Path]
Expand Down Expand Up @@ -66,34 +65,33 @@ def head(file_path: PathObject, n_lines: int = 10, encoding: str = "utf-8") -> N
return


def _progress_bar(
current: Union[int, float], total: Union[int, float], width: int = 60
) -> None:
def download_file(url: str, file_name: str) -> Optional[Path]:
"""
Custom progress bar for wget downloads.
:param current: bytes downloaded so far
:type current: Union[int, float]
:param total: Total size of download in bytes or megabytes
:type total: Union[int, float]
:param width: Progress bar width in chars, defaults to 60
:type width: int, optional
Download a file from a URL and show a progress indicator. Return the path to the downloaded file.
:param url: URL to download from
:param file_name: name of the file to download
:return: path to the downloaded file
"""

unit = "bytes"
session = requests.Session()
response = session.get(url, stream=True)
response.raise_for_status()
total_size = int(response.headers.get("content-length", 0))
block_size = 4096
progress_bar = tqdm(total=total_size, unit="iB", unit_scale=True)

# Show file size in MB for large files
if total >= 100000:
MB = 1024 * 1024
current = current / MB
total = total / MB
unit = "MB"
with open(file_name, "wb") as outfile:
for data in response.iter_content(block_size):
progress_bar.update(len(data))
outfile.write(data)
progress_bar.close()

progress = current / total
progress_message = f"Progress: \
{progress:.0%} [{current:.1f} / {total:.1f}] {unit}"
sys.stdout.write("\r" + progress_message)
sys.stdout.flush()
if total_size != 0 and progress_bar.n != total_size:
raise RuntimeError(
f"Downloaded {progress_bar.n} bytes, expected {total_size} bytes"
)

return Path(file_name)


def load(
Expand Down Expand Up @@ -126,15 +124,8 @@ def load(
file_path = Path(extended_filename)

if paws_root_dir.exists():
dump_file = Path(paws_root_dir, subdir, file_path)
return Path(paws_root_dir, subdir, file_path)

else:
url = f"{dumps_url}{str(subdir)}/{str(file_path)}"
try:
print(f"Downloading {url}")
dump_file = wget.download(url, bar=_progress_bar)
except HTTPError as e:
print(f"HTTPError: {e}")
raise

return Path(dump_file)
url = f"{dumps_url}{str(subdir)}/{str(extended_filename)}"
return download_file(url, extended_filename)
7 changes: 5 additions & 2 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import os
from pathlib import Path, PosixPath
from urllib.error import HTTPError

import pytest
import requests

from mwsql.utils import _open_file, head, load

from .helpers import Capturing

# from urllib.error import HTTPError


CURRENT_DIR = Path(__file__).parent
DATA_DIR = CURRENT_DIR.parent / "data"
FILEPATH_GZ = DATA_DIR / "testfile.sql.gz"
Expand Down Expand Up @@ -42,7 +45,7 @@ def test_load(database, filename, date, extension, expected):


def test_load_HTTPError():
with pytest.raises(HTTPError):
with pytest.raises(requests.exceptions.HTTPError):
load("simplewiki", "non-existing-filename", "latest")


Expand Down

0 comments on commit db674af

Please sign in to comment.