From 7cca82495b38d9e3c52a086958f07719981eb1cd Mon Sep 17 00:00:00 2001
From: Tahir Fayyaz
Date: Tue, 15 Feb 2022 21:32:32 +0000
Subject: [PATCH] Updated Databricks docs for correct jobs 2.1 API and links (#21494)

---
 .../providers/databricks/hooks/databricks.py  | 11 +++--
 .../databricks/operators/databricks.py        | 42 +++++++++----------
 .../operators.rst                             | 10 ++---
 3 files changed, 33 insertions(+), 30 deletions(-)

diff --git a/airflow/providers/databricks/hooks/databricks.py b/airflow/providers/databricks/hooks/databricks.py
index fd3dd99acfbbb..ee2523cf4e652 100644
--- a/airflow/providers/databricks/hooks/databricks.py
+++ b/airflow/providers/databricks/hooks/databricks.py
@@ -19,8 +19,11 @@
 Databricks hook.
 
 This hook enable the submitting and running of jobs to the Databricks platform. Internally the
-operators talk to the ``api/2.0/jobs/runs/submit``
-`endpoint <https://docs.databricks.com/api/latest/jobs.html#runs-submit>`_.
+operators talk to the
+``api/2.1/jobs/run-now``
+`endpoint <https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunNow>`_
+or the ``api/2.1/jobs/runs/submit``
+`endpoint <https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunsSubmit>`_.
 """
 import sys
 import time
@@ -380,7 +383,7 @@ def _log_request_error(self, attempt_num: int, error: str) -> None:
 
     def run_now(self, json: dict) -> int:
         """
-        Utility function to call the ``api/2.0/jobs/run-now`` endpoint.
+        Utility function to call the ``api/2.1/jobs/run-now`` endpoint.
 
         :param json: The data used in the body of the request to the ``run-now`` endpoint.
         :return: the run_id as an int
@@ -391,7 +394,7 @@
 
     def submit_run(self, json: dict) -> int:
         """
-        Utility function to call the ``api/2.0/jobs/runs/submit`` endpoint.
+        Utility function to call the ``api/2.1/jobs/runs/submit`` endpoint.
 
         :param json: The data used in the body of the request to the ``submit`` endpoint.
         :return: the run_id as an int
diff --git a/airflow/providers/databricks/operators/databricks.py b/airflow/providers/databricks/operators/databricks.py
index 26c330815daa5..e53ff9a0849f8 100644
--- a/airflow/providers/databricks/operators/databricks.py
+++ b/airflow/providers/databricks/operators/databricks.py
@@ -101,14 +101,14 @@ def _handle_databricks_operator_execution(operator, hook, log, context) -> None:
 
 class DatabricksSubmitRunOperator(BaseOperator):
     """
     Submits a Spark job run to Databricks using the
-    `api/2.0/jobs/runs/submit
-    <https://docs.databricks.com/api/latest/jobs.html#runs-submit>`_
+    `api/2.1/jobs/runs/submit
+    <https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunsSubmit>`_
     API endpoint.
 
     There are two ways to instantiate this operator.
 
     In the first way, you can take the JSON payload that you typically use
-    to call the ``api/2.0/jobs/runs/submit`` endpoint and pass it directly
+    to call the ``api/2.1/jobs/runs/submit`` endpoint and pass it directly
     to our ``DatabricksSubmitRunOperator`` through the ``json`` parameter.
     For example ::
@@ -129,7 +129,7 @@ class DatabricksSubmitRunOperator(BaseOperator):
     endpoint. In this method, your code would look like this: ::
 
         new_cluster = {
-            'spark_version': '2.1.0-db3-scala2.11',
+            'spark_version': '10.1.x-scala2.12',
             'num_workers': 2
         }
         notebook_task = {
@@ -162,7 +162,7 @@ class DatabricksSubmitRunOperator(BaseOperator):
         :ref:`howto/operator:DatabricksSubmitRunOperator`
 
     :param json: A JSON object containing API parameters which will be passed
-        directly to the ``api/2.0/jobs/runs/submit`` endpoint. The other named parameters
+        directly to the ``api/2.1/jobs/runs/submit`` endpoint. The other named parameters
         (i.e. ``spark_jar_task``, ``notebook_task``..) to this operator will
         be merged with this json dictionary if they are provided.
        If there are conflicts during the merge, the named parameters will
@@ -170,7 +170,7 @@ class DatabricksSubmitRunOperator(BaseOperator):
 
         .. seealso::
             For more information about templating see :ref:`concepts:jinja-templating`.
-            https://docs.databricks.com/api/latest/jobs.html#runs-submit
+            https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunsSubmit
     :param spark_jar_task: The main class and parameters for the JAR task. Note that
         the actual JAR is specified in the ``libraries``.
         *EITHER* ``spark_jar_task`` *OR* ``notebook_task`` *OR* ``spark_python_task``
@@ -178,28 +178,28 @@ class DatabricksSubmitRunOperator(BaseOperator):
         This field will be templated.
 
         .. seealso::
-            https://docs.databricks.com/api/latest/jobs.html#jobssparkjartask
+            https://docs.databricks.com/dev-tools/api/2.0/jobs.html#jobssparkjartask
     :param notebook_task: The notebook path and parameters for the notebook task.
         *EITHER* ``spark_jar_task`` *OR* ``notebook_task`` *OR* ``spark_python_task``
         *OR* ``spark_submit_task`` *OR* ``pipeline_task`` should be specified.
         This field will be templated.
 
         .. seealso::
-            https://docs.databricks.com/api/latest/jobs.html#jobsnotebooktask
+            https://docs.databricks.com/dev-tools/api/2.0/jobs.html#jobsnotebooktask
     :param spark_python_task: The python file path and parameters to run the python file with.
         *EITHER* ``spark_jar_task`` *OR* ``notebook_task`` *OR* ``spark_python_task``
         *OR* ``spark_submit_task`` *OR* ``pipeline_task`` should be specified.
         This field will be templated.
 
         .. seealso::
-            https://docs.databricks.com/api/latest/jobs.html#jobssparkpythontask
+            https://docs.databricks.com/dev-tools/api/2.0/jobs.html#jobssparkpythontask
     :param spark_submit_task: Parameters needed to run a spark-submit command.
         *EITHER* ``spark_jar_task`` *OR* ``notebook_task`` *OR* ``spark_python_task``
         *OR* ``spark_submit_task`` *OR* ``pipeline_task`` should be specified.
         This field will be templated.
 
         .. seealso::
-            https://docs.databricks.com/api/latest/jobs.html#jobssparksubmittask
+            https://docs.databricks.com/dev-tools/api/2.0/jobs.html#jobssparksubmittask
     :param pipeline_task: Parameters needed to execute a Delta Live Tables pipeline task.
         The provided dictionary must contain at least ``pipeline_id`` field!
         *EITHER* ``spark_jar_task`` *OR* ``notebook_task`` *OR* ``spark_python_task``
@@ -214,7 +214,7 @@ class DatabricksSubmitRunOperator(BaseOperator):
         This field will be templated.
 
         .. seealso::
-            https://docs.databricks.com/api/latest/jobs.html#jobsclusterspecnewcluster
+            https://docs.databricks.com/dev-tools/api/2.0/jobs.html#jobsclusterspecnewcluster
     :param existing_cluster_id: ID for existing cluster on which to run this task.
         *EITHER* ``new_cluster`` *OR* ``existing_cluster_id`` should be specified
         (except when ``pipeline_task`` is used).
@@ -223,7 +223,7 @@ class DatabricksSubmitRunOperator(BaseOperator):
         This field will be templated.
 
         .. seealso::
-            https://docs.databricks.com/api/latest/libraries.html#managedlibrarieslibrary
+            https://docs.databricks.com/dev-tools/api/2.0/jobs.html#managedlibrarieslibrary
     :param run_name: The run name used for this task.
         By default this will be set to the Airflow ``task_id``. This ``task_id`` is a
         required parameter of the superclass ``BaseOperator``.
@@ -344,14 +344,14 @@ def on_kill(self):
 
 class DatabricksRunNowOperator(BaseOperator):
     """
     Runs an existing Spark job run to Databricks using the
-    `api/2.0/jobs/run-now
-    <https://docs.databricks.com/api/latest/jobs.html#run-now>`_
+    `api/2.1/jobs/run-now
+    <https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunNow>`_
     API endpoint.
 
     There are two ways to instantiate this operator.
 
     In the first way, you can take the JSON payload that you typically use
-    to call the ``api/2.0/jobs/run-now`` endpoint and pass it directly
+    to call the ``api/2.1/jobs/run-now`` endpoint and pass it directly
     to our ``DatabricksRunNowOperator`` through the ``json`` parameter.
     For example ::
@@ -408,9 +408,9 @@ class DatabricksRunNowOperator(BaseOperator):
         This field will be templated.
 
         .. seealso::
-            https://docs.databricks.com/api/latest/jobs.html#run-now
+            https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunNow
     :param json: A JSON object containing API parameters which will be passed
-        directly to the ``api/2.0/jobs/run-now`` endpoint. The other named parameters
+        directly to the ``api/2.1/jobs/run-now`` endpoint. The other named parameters
         (i.e. ``notebook_params``, ``spark_submit_params``..) to this operator will
         be merged with this json dictionary if they are provided.
         If there are conflicts during the merge, the named parameters will
@@ -418,7 +418,7 @@ class DatabricksRunNowOperator(BaseOperator):
 
         .. seealso::
            For more information about templating see :ref:`concepts:jinja-templating`.
-            https://docs.databricks.com/api/latest/jobs.html#run-now
+            https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunNow
     :param notebook_params: A dict from keys to values for jobs with notebook task,
         e.g. "notebook_params": {"name": "john doe", "age": "35"}.
         The map is passed to the notebook and will be accessible through the
@@ -442,7 +442,7 @@ class DatabricksRunNowOperator(BaseOperator):
         This field will be templated.
 
         .. seealso::
-            https://docs.databricks.com/api/latest/jobs.html#run-now
+            https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunNow
     :param jar_params: A list of parameters for jobs with JAR tasks,
         e.g. "jar_params": ["john doe", "35"].
         The parameters will be passed to JAR file as command line parameters.
@@ -453,7 +453,7 @@ class DatabricksRunNowOperator(BaseOperator):
         This field will be templated.
 
         .. seealso::
-            https://docs.databricks.com/api/latest/jobs.html#run-now
+            https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunNow
     :param spark_submit_params: A list of parameters for jobs with spark submit task,
         e.g. "spark_submit_params": ["--class", "org.apache.spark.examples.SparkPi"].
         The parameters will be passed to spark-submit script as command line parameters.
@@ -463,7 +463,7 @@ class DatabricksRunNowOperator(BaseOperator):
         This field will be templated.
 
         .. seealso::
-            https://docs.databricks.com/api/latest/jobs.html#run-now
+            https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunNow
     :param timeout_seconds: The timeout for this run. By default a value of 0 is used
         which means to have no timeout.
         This field will be templated.
diff --git a/docs/apache-airflow-providers-databricks/operators.rst b/docs/apache-airflow-providers-databricks/operators.rst
index 0b8f8ac26f0ea..01c19a7987c56 100644
--- a/docs/apache-airflow-providers-databricks/operators.rst
+++ b/docs/apache-airflow-providers-databricks/operators.rst
@@ -24,14 +24,14 @@ DatabricksSubmitRunOperator
 ===========================
 
 Use the :class:`~airflow.providers.databricks.operators.DatabricksSubmitRunOperator` to submit
-a new Databricks job via Databricks `api/2.0/jobs/runs/submit <https://docs.databricks.com/api/latest/jobs.html#runs-submit>`_ API endpoint.
+a new Databricks job via Databricks `api/2.1/jobs/runs/submit <https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunsSubmit>`_ API endpoint.
 
 Using the Operator
 ^^^^^^^^^^^^^^^^^^
 
 There are two ways to instantiate this operator.
 
 In the first way, you can take the JSON payload that you typically use
-to call the ``api/2.0/jobs/runs/submit`` endpoint and pass it directly to our ``DatabricksSubmitRunOperator`` through the ``json`` parameter.
+to call the ``api/2.1/jobs/runs/submit`` endpoint and pass it directly to our ``DatabricksSubmitRunOperator`` through the ``json`` parameter.
 
 Another way to accomplish the same thing is to use the named parameters of the ``DatabricksSubmitRunOperator`` directly. Note that there is exactly one named parameter
 for each top level parameter in the ``runs/submit`` endpoint.
@@ -91,15 +91,15 @@ You can also use named parameters to initialize the operator and run the job.
 
 DatabricksRunNowOperator
 ===========================
-Use the :class:`~airflow.providers.databricks.operators.DatabricksRunNowOperator` to trigger run of existing Databricks job
-via `api/2.0/jobs/runs/run-now <https://docs.databricks.com/api/latest/jobs.html#run-now>`_ API endpoint.
+Use the :class:`~airflow.providers.databricks.operators.DatabricksRunNowOperator` to trigger a run of an existing Databricks job
+via `api/2.1/jobs/run-now <https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunNow>`_ API endpoint.
 
 Using the Operator
 ^^^^^^^^^^^^^^^^^^
 
 There are two ways to instantiate this operator.
 
 In the first way, you can take the JSON payload that you typically use
-to call the ``api/2.0/jobs/run-now`` endpoint and pass it directly to our ``DatabricksRunNowOperator`` through the ``json`` parameter.
+to call the ``api/2.1/jobs/run-now`` endpoint and pass it directly to our ``DatabricksRunNowOperator`` through the ``json`` parameter.
 
 Another way to accomplish the same thing is to use the named parameters of the ``DatabricksRunNowOperator`` directly. Note that there is exactly one named parameter
 for each top level parameter in the ``jobs/run-now`` endpoint.
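
As a usage reference, here is a minimal DAG sketch of how the two operators documented above might be wired together against the Jobs 2.1 API. The connection id, ``job_id``, cluster spec, and notebook path below are illustrative placeholders rather than values taken from this change ::

    from datetime import datetime

    from airflow import DAG
    from airflow.providers.databricks.operators.databricks import (
        DatabricksRunNowOperator,
        DatabricksSubmitRunOperator,
    )

    # Illustrative cluster spec and notebook path (placeholders, not part of this change).
    new_cluster = {'spark_version': '10.1.x-scala2.12', 'num_workers': 2}
    notebook_task = {'notebook_path': '/Users/airflow@example.com/PrepareData'}

    with DAG(
        dag_id='example_databricks_jobs_api_21',
        start_date=datetime(2022, 1, 1),
        schedule_interval=None,
        catchup=False,
    ) as dag:
        # One-off run created through the api/2.1/jobs/runs/submit endpoint.
        submit_run = DatabricksSubmitRunOperator(
            task_id='submit_run',
            databricks_conn_id='databricks_default',  # assumed connection id
            new_cluster=new_cluster,
            notebook_task=notebook_task,
        )

        # Triggers an existing job through api/2.1/jobs/run-now; job_id 42 is a placeholder.
        run_now = DatabricksRunNowOperator(
            task_id='run_now',
            databricks_conn_id='databricks_default',
            job_id=42,
            notebook_params={'name': 'john doe', 'age': '35'},
        )

        submit_run >> run_now

Here ``new_cluster`` and ``notebook_task`` map one-to-one onto top-level fields of the ``runs/submit`` payload, while ``notebook_params`` is passed through to the ``run-now`` call for the existing job.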