-
Notifications
You must be signed in to change notification settings - Fork 424
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support download dataset from origin URL in standalone mode (#828)
* Support download dataset from origin URL in standalone mode
- Loading branch information
1 parent
4bd128b
commit 13b6b4f
Showing
10 changed files
with
428 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,268 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
import hashlib | ||
import logging | ||
import os | ||
import shutil | ||
import sys | ||
import tarfile | ||
import urllib | ||
import zipfile | ||
from urllib.request import urlretrieve | ||
|
||
from tqdm import tqdm | ||
|
||
# Module-level logger shared by all download/extract helpers in this file.
logger = logging.getLogger("graphscope")
|
||
|
||
if sys.version_info >= (3, 6): | ||
|
||
def _path_to_string(path): | ||
if isinstance(path, os.PathLike): | ||
return os.fspath(path) | ||
return path | ||
|
||
|
||
elif sys.version_info >= (3, 4): | ||
|
||
def _path_to_string(path): | ||
import pathlib | ||
|
||
if isinstance(path, pathlib.Path): | ||
return str(path) | ||
return path | ||
|
||
|
||
else: | ||
|
||
def _path_to_string(path): | ||
return path | ||
|
||
|
||
def _resolve_hasher(algorithm, file_hash=None): | ||
"""Returns hash algorithm as hashlib function.""" | ||
if algorithm == "sha256": | ||
return hashlib.sha256() | ||
if algorithm == "auto" and file_hash is not None and len(file_hash) == 64: | ||
return hashlib.sha256() | ||
return hashlib.md5() | ||
|
||
|
||
def _hash_file(fpath, algorithm="sha256", chunk_size=65535): | ||
"""Calculates a file sha256 or md5 hash. | ||
Examples: | ||
..code:: python | ||
>>> _hash_file("/path/to/file") | ||
"ccd128ab673e5d7dd1cceeaa4ba5d65b67a18212c4a27b0cd090359bd7042b10" | ||
""" | ||
if isinstance(algorithm, str): | ||
hasher = _resolve_hasher(algorithm) | ||
else: | ||
hasher = algorithm | ||
with open(fpath, "rb") as file: | ||
for chunk in iter(lambda: file.read(chunk_size), b""): | ||
hasher.update(chunk) | ||
return hasher.hexdigest() | ||
|
||
|
||
def _extract_archive(fpath, path=".", archive_format="auto"):
    """Extracts an archive if it matches tar.gz, tar.bz, tar, or zip formats.

    Args:
        fpath: `PathLike` object of the archive file.
        path: path to extract the archive file.
        archive_format (str): Archive format to try for extracting the file.
            Options are "tar", "zip", "auto" or None.
            "tar" includes tar, tar.gz, tar.bz files.
            The default "auto" is ["tar", "zip"].
            None or an empty list will return no matches found.

    Returns:
        True if a match was found and an archive extraction was completed,
        False otherwise.
    """
    if archive_format is None:
        return False
    if archive_format == "auto":
        archive_format = ["tar", "zip"]
    if isinstance(archive_format, str):
        archive_format = [archive_format]

    fpath = _path_to_string(fpath)
    path = _path_to_string(path)

    for archive_type in archive_format:
        # Bug fix: the original used two independent `if` statements, so an
        # unrecognized entry (e.g. "rar") left open_fn/is_match_fn unbound
        # (NameError on first iteration, or silently reused the previous
        # type's handlers). Unknown types are now skipped explicitly.
        if archive_type == "tar":
            open_fn = tarfile.open
            is_match_fn = tarfile.is_tarfile
        elif archive_type == "zip":
            open_fn = zipfile.ZipFile
            is_match_fn = zipfile.is_zipfile
        else:
            continue

        if is_match_fn(fpath):
            with open_fn(fpath) as archive:
                try:
                    archive.extractall(path)
                except (tarfile.TarError, RuntimeError, KeyboardInterrupt):
                    # Remove the partially-extracted destination before
                    # re-raising so a later retry starts from a clean state.
                    if os.path.exists(path):
                        if os.path.isfile(path):
                            os.remove(path)
                        else:
                            shutil.rmtree(path)
                    raise
            return True
    return False
|
||
|
||
def validate_file(fpath, file_hash, algorithm="auto", chunk_size=65535):
    """Validates a file against a sha256 or md5 hash.

    Args:
        fpath: `PathLike` object of the file being validated
        file_hash (str): The expected hash string of the file.
            The sha256 and md5 hash algorithms are both supported.
        algorithm (str): Hash algorithm, one of "auto", "sha256", or "md5".
            The default "auto" detects the hash algorithm in use.
        chunk_size (int): Bytes to read at a time, important for large files.

    Returns (bool): Whether the file is valid.
    """
    hasher = _resolve_hasher(algorithm, file_hash)
    actual = _hash_file(fpath, hasher, chunk_size)
    return str(actual) == str(file_hash)
|
||
|
||
def download_file(
    fname,
    origin,
    file_hash=None,
    hash_algorithm="auto",
    extract=False,
    archive_format="auto",
    cache_dir=None,
    cache_subdir="datasets",
):
    """Downloads a file from a URL if it is not already in the cache.

    By default the file at the url `origin` is downloaded to the cache_dir
    `~/.graphscope`, placed in the cache_subdir `datasets`, and given the
    filename `fname`. The final location of a file `example.txt` would
    therefore be `~/.graphscope/datasets/example.txt`.
    File in tar, tar.gz, tar.bz, and zip formats can also be extracted.
    Passing a hash will verify the file after download. The command line
    programs `shasum` and `sha256sum` can compute the hash.

    Args:
        fname: `PathLike` object of the file. If an absolute path `/path/to/file`
            is specified the file will be saved at that location.
        origin (str): Original URL of the file.
        file_hash (str): The expected hash string of the file after download.
            The sha256 and md5 hash algorithms are both supported.
        hash_algorithm (str): Select the hash algorithm to verify the file.
            Options are `"md5"`, `"sha256"`, and `"auto"`.
            The default "auto" detects the hash algorithm in use.
        extract (bool): True tries extracting the file as an Archive, like zip.
        archive_format (str): Archive format to try for extracting the file.
            Options are `"auto"` `"tar"` `"zip"` and `None`.
            `"tar"` includes "tar", "tar.gz", and "tar.bz" files.
            The default `"auto"` corresponds to `["tar", "zip"]`.
            None or an empty list will return no matches found.
        cache_dir: Location of `PathLike` object to store cached files, when None,
            it defaults to the default directory `~/.graphscope`.
        cache_subdir: Subdirectory under the cache dir where the file is saved.

    Returns:
        Path to the downloaded file.
    """
    if cache_dir is None:
        cache_dir = os.path.join(os.path.expanduser("~"), ".graphscope")
    cache_dir = os.path.expanduser(cache_dir)
    # Fall back to a world-writable location when the chosen cache dir
    # exists but is not writable by the current user.
    if os.path.exists(cache_dir) and not os.access(cache_dir, os.W_OK):
        cache_dir = os.path.join("/tmp", ".graphscope")
    datadir = os.path.join(cache_dir, cache_subdir)
    os.makedirs(datadir, exist_ok=True)

    fname = _path_to_string(fname)
    fpath = os.path.join(datadir, fname)

    download = False
    if os.path.exists(fpath):
        # file found, verify if a hash was provided; a mismatch triggers
        # a re-download rather than an error.
        if file_hash is not None:
            if not validate_file(fpath, file_hash, algorithm=hash_algorithm):
                logger.warning(
                    "A local file was found, but it seems to be incomplete "
                    "or outdated because the %s file hash does not match the "
                    "original value of %s, so we will re-download the data.",
                    hash_algorithm,
                    file_hash,
                )
                download = True
    else:
        download = True

    if download:
        logger.info("Downloading data from %s", origin)

        class ProgressTracker(object):
            # Maintain progbar for the lifetime of download.
            # Class-level state is used because urlretrieve calls the hook
            # as a plain function, with no instance to hang state on.
            progbar = None
            record_downloaded = None

        def show_progress(block_num, block_size, total_size):
            # urlretrieve reporthook: lazily create the tqdm bar on the
            # first call, then feed it the delta downloaded since the
            # previous call (tqdm.update expects increments, not totals).
            if ProgressTracker.progbar is None:
                ProgressTracker.progbar = tqdm(
                    total=total_size, unit="iB", unit_scale=True
                )
            downloaded = min(block_num * block_size, total_size)
            if ProgressTracker.record_downloaded is None:
                ProgressTracker.record_downloaded = downloaded
                update_downloaded = downloaded
            else:
                update_downloaded = downloaded - ProgressTracker.record_downloaded
                ProgressTracker.record_downloaded = downloaded
            ProgressTracker.progbar.update(update_downloaded)
            # Reset the tracker once complete so a subsequent download
            # within the same process starts a fresh bar.
            if downloaded >= total_size:
                ProgressTracker.progbar.close()
                ProgressTracker.progbar = None
                ProgressTracker.record_downloaded = None

        error_msg = "URL fetch failure on {}:{} -- {}"
        try:
            try:
                urlretrieve(origin, fpath, show_progress)
            except urllib.error.HTTPError as e:
                raise Exception(error_msg.format(origin, e.code, e.msg))
            except urllib.error.URLError as e:
                # `URLError` has been made a subclass of OSError since version 3.3
                # https://docs.python.org/3/library/urllib.error.html
                raise Exception(error_msg.format(origin, e.errno, e.reason))
        except (Exception, KeyboardInterrupt):
            # Never leave a partial download in the cache — it would pass
            # the os.path.exists check on the next call.
            if os.path.exists(fpath):
                os.remove(fpath)
            raise

    if extract:
        _extract_archive(fpath, datadir, archive_format)

    return fpath
Oops, something went wrong.