From 6b081d5f5bad942bce7bab4053ecef1e3a9e077a Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Tue, 18 Jun 2024 17:20:01 +0900 Subject: [PATCH 1/4] [SPARK-48650][PYTHON] Display correct call site from IPython Notebook --- python/pyspark/errors/utils.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/python/pyspark/errors/utils.py b/python/pyspark/errors/utils.py index cddec3319964e..c76082b02b34f 100644 --- a/python/pyspark/errors/utils.py +++ b/python/pyspark/errors/utils.py @@ -153,6 +153,24 @@ def _capture_call_site( in the user code that led to the error. """ stack = list(reversed(inspect.stack())) + + # We try import here since IPython is not a required dependency + try: + from IPython import get_ipython + + if get_ipython(): + import pyspark + + # Filtering out PySpark code and keeping user code only + pyspark_root = os.path.dirname(pyspark.__file__) + stack = [ + frame_info + for frame_info in inspect.stack() + if pyspark_root not in frame_info.filename + ] + except ImportError: + pass + depth = int( spark_session.conf.get("spark.sql.stackTracesInDataFrameContext") # type: ignore[arg-type] ) From 3b628250b3f2e86d16c06c7d0589df0178e036cf Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Wed, 19 Jun 2024 08:38:42 +0900 Subject: [PATCH 2/4] Identify the cell number --- python/pyspark/errors/utils.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/python/pyspark/errors/utils.py b/python/pyspark/errors/utils.py index c76082b02b34f..fe6852b6d0f06 100644 --- a/python/pyspark/errors/utils.py +++ b/python/pyspark/errors/utils.py @@ -153,29 +153,37 @@ def _capture_call_site( in the user code that led to the error. 
""" stack = list(reversed(inspect.stack())) + ipython = None # We try import here since IPython is not a required dependency try: from IPython import get_ipython - if get_ipython(): - import pyspark - - # Filtering out PySpark code and keeping user code only - pyspark_root = os.path.dirname(pyspark.__file__) - stack = [ - frame_info - for frame_info in inspect.stack() - if pyspark_root not in frame_info.filename - ] + ipython = get_ipython() except ImportError: pass + if ipython: + import pyspark + + # Filtering out PySpark code and keeping user code only + pyspark_root = os.path.dirname(pyspark.__file__) + stack = [ + frame_info for frame_info in inspect.stack() if pyspark_root not in frame_info.filename + ] + depth = int( spark_session.conf.get("spark.sql.stackTracesInDataFrameContext") # type: ignore[arg-type] ) selected_frames = stack[:depth] - call_sites = [f"{frame.filename}:{frame.lineno}" for frame in selected_frames] + + # Identifying the cell is useful when the error is generated from IPython Notebook + if ipython: + call_sites = [ + f"line {frame.lineno} in cell [{ipython.execution_count}]" for frame in selected_frames + ] + else: + call_sites = [f"{frame.filename}:{frame.lineno}" for frame in selected_frames] call_sites_str = "\n".join(call_sites) pyspark_origin.set(fragment, call_sites_str) From bf3ddec9578def4603f57bc2ce3139b48dad1726 Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Wed, 19 Jun 2024 09:23:17 +0900 Subject: [PATCH 3/4] respect depth --- python/pyspark/errors/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/pyspark/errors/utils.py b/python/pyspark/errors/utils.py index 3b4d8764af586..ab54f8789716d 100644 --- a/python/pyspark/errors/utils.py +++ b/python/pyspark/errors/utils.py @@ -184,9 +184,6 @@ def _capture_call_site(spark_session: "SparkSession", depth: int) -> str: frame_info for frame_info in inspect.stack() if pyspark_root not in frame_info.filename ] - depth = int( - 
spark_session.conf.get("spark.sql.stackTracesInDataFrameContext") # type: ignore[arg-type] - ) selected_frames = stack[:depth] # Identifying the cell is useful when the error is generated from IPython Notebook From 82c5f8560f5aa2969631ba47e6c853f4dc06f392 Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Thu, 20 Jun 2024 14:40:47 +0900 Subject: [PATCH 4/4] Filtering out PySpark code properly --- python/pyspark/errors/utils.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/python/pyspark/errors/utils.py b/python/pyspark/errors/utils.py index ab54f8789716d..9155bfb54abe8 100644 --- a/python/pyspark/errors/utils.py +++ b/python/pyspark/errors/utils.py @@ -21,6 +21,7 @@ import os import threading from typing import Any, Callable, Dict, Match, TypeVar, Type, Optional, TYPE_CHECKING +import pyspark from pyspark.errors.error_classes import ERROR_CLASSES_MAP if TYPE_CHECKING: @@ -164,8 +165,13 @@ def _capture_call_site(spark_session: "SparkSession", depth: int) -> str: The call site information is used to enhance error messages with the exact location in the user code that led to the error. 
""" - stack = list(reversed(inspect.stack())) - ipython = None + # Filtering out PySpark code and keeping user code only + pyspark_root = os.path.dirname(pyspark.__file__) + stack = [ + frame_info for frame_info in inspect.stack() if pyspark_root not in frame_info.filename + ] + + selected_frames = stack[:depth] # We try import here since IPython is not a required dependency try: @@ -173,18 +179,7 @@ def _capture_call_site(spark_session: "SparkSession", depth: int) -> str: ipython = get_ipython() except ImportError: - pass - - if ipython: - import pyspark - - # Filtering out PySpark code and keeping user code only - pyspark_root = os.path.dirname(pyspark.__file__) - stack = [ - frame_info for frame_info in inspect.stack() if pyspark_root not in frame_info.filename - ] - - selected_frames = stack[:depth] + ipython = None # Identifying the cell is useful when the error is generated from IPython Notebook if ipython: