7 changes: 7 additions & 0 deletions python/pyspark/taskcontext.py
@@ -53,6 +53,10 @@ def _getOrCreate(cls):
            cls._taskContext = TaskContext()
        return cls._taskContext

    @classmethod
    def _setTaskContext(cls, taskContext):
        cls._taskContext = taskContext

    @classmethod
    def get(cls):
        """
@@ -162,7 +166,10 @@ def get(cls):
        running tasks.

        .. note:: Must be called on the worker, not the driver. Returns None if not initialized.
            An Exception will be raised if it is not in a barrier stage.
        """
        if not isinstance(cls._taskContext, BarrierTaskContext):
            raise Exception('It is not in a barrier stage')
        return cls._taskContext

    @classmethod
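For reference, a minimal usage sketch of the behavior this hunk gives BarrierTaskContext.get(); this is an illustration, not part of the diff, and it assumes an already-running SparkContext named `sc`:

# Hedged sketch: `sc` is an assumed, already-created SparkContext.
from pyspark import BarrierTaskContext, TaskContext

def which(iterator):
    try:
        # After this change, get() only succeeds inside a barrier stage.
        ctx = BarrierTaskContext.get()
        yield ('barrier', ctx.partitionId())
    except Exception:
        # In a normal stage the active context is a plain TaskContext, so get() raises.
        yield ('normal', TaskContext.get().partitionId())

rdd = sc.parallelize(range(4), 2)
print(rdd.mapPartitions(which).collect())            # [('normal', 0), ('normal', 1)]
print(rdd.barrier().mapPartitions(which).collect())  # [('barrier', 0), ('barrier', 1)]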
85 changes: 85 additions & 0 deletions python/pyspark/tests/test_taskcontext.py
@@ -25,6 +25,9 @@
from pyspark import SparkConf, SparkContext, TaskContext, BarrierTaskContext
from pyspark.testing.utils import PySparkTestCase, SPARK_HOME

if sys.version_info[0] >= 3:
    xrange = range


class TaskContextTests(PySparkTestCase):

@@ -146,6 +149,49 @@ def f(iterator):
        self.assertTrue(len(taskInfos) == 4)
        self.assertTrue(len(taskInfos[0]) == 4)

    def test_context_get(self):
        """
        Verify that TaskContext.get() works both inside and outside of a barrier stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            taskContext = TaskContext.get()
            if isinstance(taskContext, BarrierTaskContext):
                yield taskContext.partitionId() + 1
            elif isinstance(taskContext, TaskContext):
                yield taskContext.partitionId() + 2
            else:
                yield -1

        # for normal stage
        result1 = rdd.mapPartitions(f).collect()
        self.assertTrue(result1 == [2, 3, 4, 5])
        # for barrier stage
        result2 = rdd.barrier().mapPartitions(f).collect()
        self.assertTrue(result2 == [1, 2, 3, 4])

    def test_barrier_context_get(self):
        """
        Verify that BarrierTaskContext.get() only works in a barrier stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            try:
                taskContext = BarrierTaskContext.get()
            except Exception:
                yield -1
            else:
                yield taskContext.partitionId()

        # for normal stage
        result1 = rdd.mapPartitions(f).collect()
        self.assertTrue(result1 == [-1, -1, -1, -1])
        # for barrier stage
        result2 = rdd.barrier().mapPartitions(f).collect()
        self.assertTrue(result2 == [0, 1, 2, 3])


class TaskContextTestsWithWorkerReuse(unittest.TestCase):

@@ -181,6 +227,45 @@ def context_barrier(x):
        for pid in pids:
            self.assertTrue(pid in worker_pids)

    def test_task_context_correct_with_python_worker_reuse(self):
        """Verify that the task context is correct when python workers are reused"""
        # start a normal job first to start all workers and get all worker pids
        worker_pids = self.sc.parallelize(xrange(2), 2).map(lambda x: os.getpid()).collect()
        # the workers will be reused in this barrier job
        rdd = self.sc.parallelize(xrange(10), 2)

        def context(iterator):
            tp = TaskContext.get().partitionId()
            try:
                bp = BarrierTaskContext.get().partitionId()
            except Exception:
                bp = -1

            yield (tp, bp, os.getpid())

        # normal stage after normal stage
        normal_result = rdd.mapPartitions(context).collect()
        tps, bps, pids = zip(*normal_result)
        self.assertTrue(tps == (0, 1))
        self.assertTrue(bps == (-1, -1))
        for pid in pids:
            self.assertTrue(pid in worker_pids)
        # barrier stage after normal stage
        barrier_result = rdd.barrier().mapPartitions(context).collect()
        tps, bps, pids = zip(*barrier_result)
        self.assertTrue(tps == (0, 1))
        self.assertTrue(bps == (0, 1))
        for pid in pids:
            self.assertTrue(pid in worker_pids)
        # normal stage after barrier stage
        normal_result2 = rdd.mapPartitions(context).collect()
        tps, bps, pids = zip(*normal_result2)
        self.assertTrue(tps == (0, 1))
        self.assertTrue(bps == (-1, -1))
        for pid in pids:
            self.assertTrue(pid in worker_pids)

    def tearDown(self):
        self.sc.stop()

8 changes: 8 additions & 0 deletions python/pyspark/worker.py
@@ -503,6 +503,9 @@ def main(infile, outfile):
        if isBarrier:
            taskContext = BarrierTaskContext._getOrCreate()
            BarrierTaskContext._initialize(boundPort, secret)
            # Set the task context instance here, so that it can be retrieved through
            # TaskContext.get() for both TaskContext and BarrierTaskContext.
            TaskContext._setTaskContext(taskContext)
        else:
            taskContext = TaskContext._getOrCreate()
        # read inputs for TaskContext info
@@ -596,6 +599,11 @@ def process():
            profiler.profile(process)
        else:
            process()

        # Reset the task context to None. This is a guard to avoid residual context when the
        # worker is reused.
        TaskContext._setTaskContext(None)
        BarrierTaskContext._setTaskContext(None)
HyukjinKwon (Member) commented on Oct 28, 2019:
Hm, what happens if it fails with exceptions in the middle of execution in this worker?

A Contributor commented:
Is it really needed? We always set the global TaskContext and never reset it previously.

ConeyLiu (Contributor, Author) replied on Oct 29, 2019:
> Hm, what happens if it fails with exceptions in the middle of execution in this worker?

If an exception occurs, the worker will be closed with sys.exit(-1).

> Is it really needed? We always set the global TaskContext and never reset it previously.

Previously:

val rdd = ...
val barriered = rdd.barrier().mapPartitions(...)

barriered.mapPartitions(...)  # here the BarrierTaskContext still existed.

This code is just a guard; it shouldn't add extra overhead or change behavior.

HyukjinKwon (Member) replied:
> This code is just a guard; it shouldn't add extra overhead or change behavior.

I guess that's only when the worker is reused. Can you clarify it with comments here?

ConeyLiu (Contributor, Author) replied:
thanks for reviewing. updated.
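
To make the residual-context concern discussed above concrete, here is a minimal sketch of the worker-reuse scenario; it is an illustration only, assuming a running SparkContext `sc` with the default spark.python.worker.reuse=true:

# Hedged sketch: `sc` is an assumed SparkContext; spark.python.worker.reuse defaults to true.
from pyspark import TaskContext

def context_type(iterator):
    # Reports which context class the (possibly reused) worker currently holds.
    yield type(TaskContext.get()).__name__

rdd = sc.parallelize(range(4), 2)
print(rdd.barrier().mapPartitions(context_type).collect())  # ['BarrierTaskContext', 'BarrierTaskContext']
# Without the reset guard, a reused worker could keep returning the stale
# BarrierTaskContext here; with the guard, a later normal stage sees a plain TaskContext.
print(rdd.mapPartitions(context_type).collect())            # ['TaskContext', 'TaskContext']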

    except Exception:
        try:
            exc_info = traceback.format_exc()