-
Notifications
You must be signed in to change notification settings - Fork 4.5k
[BEAM-1988] Migrate from utils.path to BFS #2643
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -61,10 +61,9 @@ | |
| import sys | ||
| import tempfile | ||
|
|
||
|
|
||
| from apache_beam import utils | ||
| from apache_beam import version as beam_version | ||
| from apache_beam.internal import pickler | ||
| from apache_beam.io.filesystems_util import get_filesystem | ||
| from apache_beam.runners.dataflow.internal import names | ||
| from apache_beam.utils import processes | ||
| from apache_beam.utils.pipeline_options import GoogleCloudOptions | ||
|
|
@@ -158,6 +157,7 @@ def _stage_extra_packages(extra_packages, staging_location, temp_dir, | |
| name patterns. | ||
| """ | ||
| resources = [] | ||
| staging_filesystem = get_filesystem(staging_location) | ||
| staging_temp_dir = None | ||
| local_packages = [] | ||
| for package in extra_packages: | ||
|
|
@@ -190,13 +190,14 @@ def _stage_extra_packages(extra_packages, staging_location, temp_dir, | |
| local_packages.append(package) | ||
|
|
||
| if staging_temp_dir: | ||
| temp_fs = get_filesystem(staging_temp_dir) | ||
| local_packages.extend( | ||
| [utils.path.join(staging_temp_dir, f) for f in os.listdir( | ||
| [temp_fs.join(staging_temp_dir, f) for f in os.listdir( | ||
| staging_temp_dir)]) | ||
|
|
||
| for package in local_packages: | ||
| basename = os.path.basename(package) | ||
| staged_path = utils.path.join(staging_location, basename) | ||
| staged_path = staging_filesystem.join(staging_location, basename) | ||
| file_copy(package, staged_path) | ||
| resources.append(basename) | ||
| # Create a file containing the list of extra packages and stage it. | ||
|
|
@@ -209,7 +210,7 @@ def _stage_extra_packages(extra_packages, staging_location, temp_dir, | |
| with open(os.path.join(temp_dir, EXTRA_PACKAGES_FILE), 'wt') as f: | ||
| for package in local_packages: | ||
| f.write('%s\n' % os.path.basename(package)) | ||
| staged_path = utils.path.join(staging_location, EXTRA_PACKAGES_FILE) | ||
| staged_path = staging_filesystem.join(staging_location, EXTRA_PACKAGES_FILE) | ||
| # Note that the caller of this function is responsible for deleting the | ||
| # temporary folder where all temp files are created, including this one. | ||
| file_copy(os.path.join(temp_dir, EXTRA_PACKAGES_FILE), staged_path) | ||
|
|
@@ -284,13 +285,15 @@ def stage_job_resources( | |
| raise RuntimeError( | ||
| 'The --temp_location option must be specified.') | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we check for
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll investigate.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you. This does not need to happen in this PR if it is involved.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is just a sanity check as https://github.com/apache/beam/blob/master/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py#L325 would have already assigned temp_location to staging_location had it not been specified. I think we can leave it as is right now. |
||
|
|
||
| filesystem = get_filesystem(google_cloud_options.staging_location) | ||
|
|
||
| # Stage a requirements file if present. | ||
| if setup_options.requirements_file is not None: | ||
| if not os.path.isfile(setup_options.requirements_file): | ||
| raise RuntimeError('The file %s cannot be found. It was specified in the ' | ||
| '--requirements_file command line option.' % | ||
| setup_options.requirements_file) | ||
| staged_path = utils.path.join(google_cloud_options.staging_location, | ||
| staged_path = filesystem.join(google_cloud_options.staging_location, | ||
| REQUIREMENTS_FILE) | ||
| file_copy(setup_options.requirements_file, staged_path) | ||
| resources.append(REQUIREMENTS_FILE) | ||
|
|
@@ -305,7 +308,7 @@ def stage_job_resources( | |
| populate_requirements_cache( | ||
| setup_options.requirements_file, requirements_cache_path) | ||
| for pkg in glob.glob(os.path.join(requirements_cache_path, '*')): | ||
| file_copy(pkg, utils.path.join(google_cloud_options.staging_location, | ||
| file_copy(pkg, filesystem.join(google_cloud_options.staging_location, | ||
| os.path.basename(pkg))) | ||
| resources.append(os.path.basename(pkg)) | ||
|
|
||
|
|
@@ -324,7 +327,7 @@ def stage_job_resources( | |
| 'setup.py instead of %s' % setup_options.setup_file) | ||
| tarball_file = _build_setup_package(setup_options.setup_file, temp_dir, | ||
| build_setup_args) | ||
| staged_path = utils.path.join(google_cloud_options.staging_location, | ||
| staged_path = filesystem.join(google_cloud_options.staging_location, | ||
| WORKFLOW_TARBALL_FILE) | ||
| file_copy(tarball_file, staged_path) | ||
| resources.append(WORKFLOW_TARBALL_FILE) | ||
|
|
@@ -344,7 +347,7 @@ def stage_job_resources( | |
| pickled_session_file = os.path.join(temp_dir, | ||
| names.PICKLED_MAIN_SESSION_FILE) | ||
| pickler.dump_session(pickled_session_file) | ||
| staged_path = utils.path.join(google_cloud_options.staging_location, | ||
| staged_path = filesystem.join(google_cloud_options.staging_location, | ||
| names.PICKLED_MAIN_SESSION_FILE) | ||
| file_copy(pickled_session_file, staged_path) | ||
| resources.append(names.PICKLED_MAIN_SESSION_FILE) | ||
|
|
@@ -359,7 +362,7 @@ def stage_job_resources( | |
| else: | ||
| stage_tarball_from_remote_location = False | ||
|
|
||
| staged_path = utils.path.join(google_cloud_options.staging_location, | ||
| staged_path = filesystem.join(google_cloud_options.staging_location, | ||
| names.DATAFLOW_SDK_TARBALL_FILE) | ||
| if stage_tarball_from_remote_location: | ||
| # If --sdk_location is not specified then the appropriate package | ||
|
|
||
This file was deleted.
This file was deleted.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What happens if
`temp_location` is not a GCS location? It is an error by itself, but the join part was handled correctly in the previous code. I wonder if there is any value in something like
`bfs.join`, where `bfs` would call the right `join` method for the given type, avoiding calls to `get_file_system`. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I think me and Cham discussed it at some point. I can work on that in another PR and clean the get_filesystem infiltration everywhere.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you.