Merge pull request #1744 from davidmarin/v0.6.2-release
v0.6.2 release
David Marin committed Mar 23, 2018
2 parents 87a6fec + fe35641 commit 54dc657
Showing 31 changed files with 187 additions and 69 deletions.
36 changes: 27 additions & 9 deletions CHANGES.txt
@@ -1,13 +1,31 @@
v0.6.2, 2018-03-?? -- farewell google-api-python-client
v0.6.2, 2018-03-23 -- log parsing at scale
* runners:
* dataproc:
* replaced google-api-python-client with google-cloud-sdk
* GCSFilesystem method changes:
* api_client attr has been replaced with client
* create_bucket() no longer takes a project ID
* delete_bucket() is disabled (use get_bucket(...).delete())
* get_bucket() returns a google.cloud.storage.bucket.Bucket
* list_buckets() is disabled (use get_all_bucket_names())
* local runners
* added --num-cores option to control parallelism and file splits (#1727)
* cloud runners (EMR and Dataproc):
* idle timeout script has 10-minute grace period (#1694)
* Dataproc:
* replaced google-api-python-client with google-cloud-sdk (#1730)
* works without gcloud util config installed (#1742)
* credentials can be read from $GOOGLE_APPLICATION_CREDENTIALS or from gcloud util config (if installed)
* no longer required to set region or zone (#1732)
* auto zone placement (just set region) is enabled
* defaults to auto zone placement in us-west1
* no longer reads zone or region from gcloud GCE configs
* Dataproc Quickstart is now up-to-date (#1589)
* api_client attr has been replaced with cluster_client and job_client
* GCSFilesystem method changes:
* api_client attr has been replaced with client
* create_bucket() no longer takes a project ID
* delete_bucket() is disabled (use get_bucket(...).delete())
* get_bucket() returns a google.cloud.storage.bucket.Bucket
* list_buckets() is disabled (use get_all_bucket_names())
* EMR:
* much faster error log parsing (#1706)
* may have to wait for logs to transfer to S3 on some AMIs
* tools:
* terminate-idle-job-flows is faster and makes fewer API calls

v0.6.1, 2017-11-27 -- mrjob diagnose
* fixed serious error log parsing issue (#1708)
17 changes: 16 additions & 1 deletion docs/guides/configs-hadoopy-runners.rst
@@ -13,7 +13,7 @@ Options specific to the local and inline runners
:config: hadoop_version
:switch: --hadoop-version
:type: :ref:`string <data-type-string>`
:set: emr
:set: local
:default: ``None``

Set the version of Hadoop to simulate (this currently only matters for
@@ -24,6 +24,21 @@ Options specific to the local and inline runners
the runner sets a simulated jobconf variable, it'll use *every* possible
name for it (e.g. ``user.name`` *and* ``mapreduce.job.user.name``).

.. mrjob-opt::
:config: num_cores
:switch: --num-cores
:type: integer
:set: local
:default: ``None``

Maximum number of tasks to handle at one time. If not set, defaults to the
number of CPUs on your system.

This also affects the number of input file splits the runner makes (the
only impact in ``inline`` mode).

.. versionadded:: 0.6.2

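As a minimal sketch (hedged: ``MRYourJob`` stands in for your own job
class; the rest is standard mrjob runner usage)::

    mr_job = MRYourJob(args=['-r', 'local', '--num-cores', '2'])
    with mr_job.make_runner() as runner:
        runner.run()  # at most 2 tasks at a time; input gets 2 splits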

Options available to local, hadoop, and emr runners
---------------------------------------------------
22 changes: 18 additions & 4 deletions docs/guides/dataproc-opts.rst
@@ -28,14 +28,28 @@ Choosing/creating a cluster to join
Cluster creation and configuration
-----------------------------------

.. mrjob-opt::
:config: project_id
:switch: --project-id
:type: :ref:`string <data-type-string>`
:set: dataproc
:default: read from credentials config file

The ID of the Google Cloud Project to run under.

.. versionchanged:: 0.6.2

This used to be called *gcp_project*

.. mrjob-opt::
:config: zone
:switch: --zone
:type: :ref:`string <data-type-string>`
:set: dataproc
:default: gcloud SDK default

Availability zone to run the job in
Availability zone to run the job in. If you set this, you do not need
to set :mrjob-opt:`region`.

.. mrjob-opt::
:config: region
@@ -44,9 +58,9 @@ Cluster creation and configuration
:set: dataproc
:default: gcloud SDK default

region to run Dataproc jobs on (e.g. ``us-central-1``). Also used by mrjob
to create temporary buckets if you don't set :mrjob-opt:`cloud_tmp_dir`
explicitly.
region to run Dataproc jobs on (e.g. ``us-central-1``). Setting region
enables auto zone placement: a :mrjob-opt:`zone` within the region will
be picked for you.
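
For example, a minimal sketch (``MRYourJob`` is a hypothetical job
class; the rest is standard mrjob runner usage)::

    mr_job = MRYourJob(args=['-r', 'dataproc', '--region', 'us-west1'])
    with mr_job.make_runner() as runner:
        runner.run()  # Dataproc picks a zone within us-west1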

.. mrjob-opt::
:config: image_version
11 changes: 6 additions & 5 deletions docs/guides/dataproc-quickstart.rst
@@ -14,8 +14,8 @@ Creating an account

* Go to `cloud.google.com <https://cloud.google.com>`__.
* Click the circle in the upper right, and select your Google account (if you
don't have one sign up `here <https://accounts.google.com/SignUp>`__.
* `If you have multiple Google accounts, sign out first, and then sign into
don't have one sign up `here <https://accounts.google.com/SignUp>`__.) `If
you have multiple Google accounts, sign out first, and then sign into
the account you want to use.`
* Click **Try it Free** in the upper right
* Enter your name and payment information
@@ -31,16 +31,17 @@ Enabling Google Cloud Dataproc
Creating credentials
^^^^^^^^^^^^^^^^^^^^

* Go `here <https://console.cloud.google.com/apis/credentials>__` (or pick **APIs & Services > Credentials** in the upper left-hand menu)
* Go `here <https://console.cloud.google.com/apis/credentials>`__ (or pick **APIs & Services > Credentials** in the upper left-hand menu)
* Pick **Create credentials > Service account key**
* Select **Compute engine default service account**
* Click **Create** to download a JSON file.
* Point **$GOOGLE_APPLICATION_CREDENTIALS** at the file you downloaded (``export GOOGLE_APPLICATION_CREDENTIALS="/path/to/Your Credentials.json"``).

You do not have to download or install the :command:`gcloud` utility, but if
you have it installed and configured, mrjob can read credentials from its
config files rather than **$GOOGLE_APPLICATION_CREDENTIALS**. See
`Installing Cloud SDK <https://cloud.google.com/sdk/downloads>__` for more
config files rather than **$GOOGLE_APPLICATION_CREDENTIALS**.

See `Installing Cloud SDK <https://cloud.google.com/sdk/downloads>`__ for more
information.
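
To check which credentials will be used, one option (a sketch, assuming
the ``google-auth`` package that the new ``google-cloud-sdk`` client
libraries depend on) is to resolve them the same way those libraries
do::

    import google.auth

    # looks at $GOOGLE_APPLICATION_CREDENTIALS first, then falls back
    # to gcloud's application default credentials (if configured)
    credentials, project_id = google.auth.default()
    print(project_id)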

.. _running-a-dataproc-job:
29 changes: 29 additions & 0 deletions docs/whats-new.rst
@@ -4,6 +4,35 @@ What's New
For a complete list of changes, see `CHANGES.txt
<https://github.com/Yelp/mrjob/blob/master/CHANGES.txt>`_

.. _v0.6.2:

0.6.2
-----

mrjob is now orders of magnitude quicker at parsing logs, making it practical
to diagnose rare errors from very large jobs. However, on some AMIs, it can no
longer parse errors without waiting for logs to transfer to S3 (this may be
fixed in a future version).

To run jobs on Google Cloud Dataproc, mrjob no longer requires you to install
the :command:`gcloud` util (though if
you do have it installed, mrjob can read credentials from its configs). For
details, see :doc:`guides/dataproc-quickstart`.

mrjob no longer requires you to select a Dataproc :mrjob-opt:`zone` prior
to running jobs. Auto zone placement (just set :mrjob-opt:`region` and let
Dataproc pick a zone) is now enabled, with the default being auto zone
placement in ``us-west1``. mrjob no longer reads zone and region from
:command:`gcloud`\'s compute engine configs.

mrjob's Dataproc code has been ported from the ``google-python-api-client``
library (which is in maintenance mode) to ``google-cloud-sdk``, resulting in
some small changes to the GCS filesystem API. See `CHANGES.txt
<https://github.com/Yelp/mrjob/blob/master/CHANGES.txt>`_ for details.
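
A minimal sketch of the new surface, based on the method changes listed
there (whether ``GCSFilesystem()`` can be constructed with no arguments
is an assumption of this sketch; ``mrjob/dataproc.py`` passes
``credentials`` and ``project_id``; the bucket name below is
hypothetical)::

    from mrjob.fs.gcs import GCSFilesystem

    fs = GCSFilesystem()  # fs.client is now a google.cloud.storage client

    names = fs.get_all_bucket_names()    # replaces list_buckets()
    bucket = fs.get_bucket('mrjob-tmp')  # a google.cloud.storage Bucket
    bucket.delete()                      # replaces delete_bucket()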

Local mode now has a :mrjob-opt:`num_cores` option that allows you to control
how many tasks it handles simultaneously.

.. _v0.6.1:

0.6.1
5 changes: 3 additions & 2 deletions mrjob/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

# Copyright 2009-2017 Yelp and Contributors
# Copyright 2009-2018 Yelp and Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -122,9 +122,10 @@
'William Vambenepe <vbp@google.com>',
'Daniel Vetter <lord_fright@yahoo.com>',
'Paul Wais <pwais@yelp.com>',
'Qiu Wei <qiuwei.cw@alibaba-inc.com>',
'Derek Wilson <jderekwilson@gmail.com>',
'Tao Yu <taoyu@yelp.com>',
'Andrea Zonca <andrea.zonca@gmail.com>',
]

__version__ = '0.6.2.dev0'
__version__ = '0.6.2'
6 changes: 4 additions & 2 deletions mrjob/bootstrap/terminate_idle_cluster_dataproc.sh
@@ -5,6 +5,7 @@
# Copyright 2015 Yelp and Contributors
# Copyright 2016 Google
# Copyright 2017 Yelp
# Copyright 2018 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -21,8 +22,9 @@
# Author: Matthew Tai <mtai84@gmail.com>

# This script is part of mrjob, but can be run as an initializationAction on
# ANY GCP Dataproc cluster. Because initializationAction scripts cannot take args
# this script reads MAX_SECS_IDLE from metadata attribute "mrjob-max-secs-idle"
# ANY GCP Dataproc cluster. Because initializationAction scripts cannot take
# args, this script reads MAX_SECS_IDLE from metadata attribute
# "mrjob-max-secs-idle"

# This script runs `yarn application -list` in a loop and considers the cluster
# idle if no jobs are currently running. If the cluster stays idle long
1 change: 1 addition & 0 deletions mrjob/cloud.py
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2017 Yelp
# Copyright 2018 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
6 changes: 3 additions & 3 deletions mrjob/dataproc.py
@@ -166,7 +166,7 @@ def _check_and_fix_fs_dir(gcs_uri):
def _zone_to_region(zone):
"""Convert a zone (like us-west1-b) to the corresponding region
(like us-west1)."""
# See https://cloud.google.com/compute/docs/regions-zones/#identifying_a_region_or_zone #noqa
# See https://cloud.google.com/compute/docs/regions-zones/#identifying_a_region_or_zone # noqa
return '-'.join(zone.split('-')[:-1])


@@ -361,7 +361,7 @@ def fs(self):
return self._fs

self._gcs_fs = GCSFilesystem(
credentials = self._credentials,
credentials=self._credentials,
local_tmp_dir=self._get_local_tmp_dir(),
project_id=self._project_id,
)
@@ -835,7 +835,7 @@ def _cluster_create_kwargs(self):
cluster_metadata['mrjob-max-secs-idle'] = str(int(
self._opts['max_mins_idle'] * 60))

gce_cluster_config=dict(
gce_cluster_config = dict(
service_account_scopes=_DEFAULT_GCE_SERVICE_ACCOUNT_SCOPES,
metadata=cluster_metadata
)
1 change: 1 addition & 0 deletions mrjob/emr.py
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2009-2017 Yelp and Contributors
# Copyright 2018 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
3 changes: 1 addition & 2 deletions mrjob/fs/gcs.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright 2016 Google Inc.
# Copyright 2017 Yelp
# Copyright 2018 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -22,10 +23,8 @@
from mrjob.cat import decompress
from mrjob.fs.base import Filesystem
from mrjob.parse import urlparse
from mrjob.py2 import PY2
from mrjob.runner import GLOB_RE

# TODO: loading credentials
try:
import google.cloud.storage.client
from google.api_core.exceptions import NotFound
1 change: 1 addition & 0 deletions mrjob/fs/s3.py
@@ -1,5 +1,6 @@
# Copyright 2009-2016 Yelp and Contributors
# Copyright 2017 Yelp
# Copyright 2018 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
8 changes: 2 additions & 6 deletions mrjob/local.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright 2009-2013 Yelp and Contributors
# Copyright 2015-2017 Yelp
# Copyright 2018 Yelp and Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -68,7 +69,6 @@ class LocalMRJobRunner(SimMRJobRunner, MRJobBinRunner):
'sort_bin',
}


def __init__(self, **kwargs):
"""Arguments to this constructor may also appear in :file:`mrjob.conf`
under ``runners/local``.
@@ -85,12 +85,8 @@ def __init__(self, **kwargs):
require Java. If you need to test these, consider starting up a
standalone Hadoop instance and running your job with ``-r hadoop``.
"""
self.NUM_CORES = kwargs.get('num_cores')
super(LocalMRJobRunner, self).__init__(**kwargs)

def _get_num_cores(self):
return self.NUM_CORES if self.NUM_CORES else None

def _invoke_task_func(self, task_type, step_num, task_num):
args = self._substep_args(step_num, task_type)
num_steps = self._num_steps()
@@ -103,7 +99,7 @@ def _invoke_task_func(self, task_type, step_num, task_num):

def _run_multiple(self, funcs, num_processes=None):
"""Use multiprocessing to run in parallel."""
pool = Pool(processes=self._get_num_cores())
pool = Pool(processes=self._opts['num_cores'])

try:
results = [
6 changes: 3 additions & 3 deletions mrjob/logs/errors.py
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2015-2016 Yelp
# Copyright 2015-2017 Yelp
# Copyright 2018 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -19,7 +20,6 @@
from .ids import _time_sort_key



def _pick_error(log_interpretation):
"""Pick most recent error from a dictionary possibly containing
step, history, and task interpretations. Returns None if there
Expand All @@ -42,7 +42,7 @@ def yield_errors():
yield error

attempt_to_container_id = log_interpretation.get('history', {}).get(
'attempt_to_container_id', {})
'attempt_to_container_id', {})

return _merge_and_sort_errors(yield_errors(), attempt_to_container_id)

4 changes: 2 additions & 2 deletions mrjob/logs/history.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2015-2016 Yelp
# Copyright 2017 Yelp
# Copyright 2015-2017 Yelp
# Copyright 2018 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
5 changes: 2 additions & 3 deletions mrjob/logs/ids.py
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2015-2016 Yelp
# Copyright 2015-2017 Yelp
# Copyright 2018 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -76,8 +77,6 @@ def _time_sort_key(d):
attempt_num,
task_num)

return sort_key


def _add_implied_task_id(d):
"""If *d* (a dictionary) has *attempt_id* but not *task_id*, add it.
