From f7435bec6a9348cfbbe26b13c230c08545d16067 Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Thu, 26 Apr 2018 15:11:42 -0700
Subject: [PATCH] [SPARK-24044][PYTHON] Explicitly print out skipped tests from unittest module

## What changes were proposed in this pull request?

This PR proposes to remove the duplicated dependency-checking logic and to explicitly print out tests skipped by the unittest module. For example:

```
Skipped tests in pyspark.sql.tests with pypy:
    test_createDataFrame_column_name_encoding (pyspark.sql.tests.ArrowTests) ... skipped 'Pandas >= 0.19.2 must be installed; however, it was not found.'
    test_createDataFrame_does_not_modify_input (pyspark.sql.tests.ArrowTests) ... skipped 'Pandas >= 0.19.2 must be installed; however, it was not found.'
    ...

Skipped tests in pyspark.sql.tests with python3:
    test_createDataFrame_column_name_encoding (pyspark.sql.tests.ArrowTests) ... skipped 'PyArrow >= 0.8.0 must be installed; however, it was not found.'
    test_createDataFrame_does_not_modify_input (pyspark.sql.tests.ArrowTests) ... skipped 'PyArrow >= 0.8.0 must be installed; however, it was not found.'
    ...
```

Currently, skipped tests are not printed to the console at all. It is better to print them out explicitly so it is clear which tests actually ran.

## How was this patch tested?

Manually tested. Jenkins also provides a good environment for verifying the skipped-test output.

Author: hyukjinkwon

Closes #21107 from HyukjinKwon/skipped-tests-print.
---
 python/pyspark/ml/tests.py        |  16 +++--
 python/pyspark/mllib/tests.py     |   4 +-
 python/pyspark/sql/tests.py       |  51 +++++++------
 python/pyspark/streaming/tests.py |   4 +-
 python/pyspark/tests.py           |  12 +---
 python/run-tests.py               | 115 +++++++++++++-----------------
 6 files changed, 98 insertions(+), 104 deletions(-)

diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 2ec0be60e9fa9..093593132e56d 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -2136,17 +2136,23 @@ class ImageReaderTest2(PySparkTestCase):
     @classmethod
     def setUpClass(cls):
         super(ImageReaderTest2, cls).setUpClass()
+        cls.hive_available = True
         # Note that here we enable Hive's support.
         cls.spark = None
         try:
             cls.sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
         except py4j.protocol.Py4JError:
             cls.tearDownClass()
-            raise unittest.SkipTest("Hive is not available")
+            cls.hive_available = False
         except TypeError:
             cls.tearDownClass()
-            raise unittest.SkipTest("Hive is not available")
-        cls.spark = HiveContext._createForTesting(cls.sc)
+            cls.hive_available = False
+        if cls.hive_available:
+            cls.spark = HiveContext._createForTesting(cls.sc)
+
+    def setUp(self):
+        if not self.hive_available:
+            self.skipTest("Hive is not available.")
 
     @classmethod
     def tearDownClass(cls):
@@ -2662,6 +2668,6 @@ def testDefaultFitMultiple(self):
 if __name__ == "__main__":
     from pyspark.ml.tests import *
     if xmlrunner:
-        unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'))
+        unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'), verbosity=2)
     else:
-        unittest.main()
+        unittest.main(verbosity=2)
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 1037bab7f1088..14d788b0bef60 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -1767,9 +1767,9 @@ def test_pca(self):
     if not _have_scipy:
         print("NOTE: Skipping SciPy tests as it does not seem to be installed")
     if xmlrunner:
-        unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'))
+        unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'), verbosity=2)
     else:
-        unittest.main()
+        unittest.main(verbosity=2)
     if not _have_scipy:
         print("NOTE: SciPy tests were skipped as it does not seem to be installed")
     sc.stop()
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 98fa1b54b0a17..6b28c557a803e 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -3096,23 +3096,28 @@ def setUpClass(cls):
         filename_pattern = (
             "sql/core/target/scala-*/test-classes/org/apache/spark/sql/"
             "TestQueryExecutionListener.class")
-        if not glob.glob(os.path.join(SPARK_HOME, filename_pattern)):
-            raise unittest.SkipTest(
+        cls.has_listener = bool(glob.glob(os.path.join(SPARK_HOME, filename_pattern)))
+
+        if cls.has_listener:
+            # Note that 'spark.sql.queryExecutionListeners' is a static immutable configuration.
+            cls.spark = SparkSession.builder \
+                .master("local[4]") \
+                .appName(cls.__name__) \
+                .config(
+                    "spark.sql.queryExecutionListeners",
+                    "org.apache.spark.sql.TestQueryExecutionListener") \
+                .getOrCreate()
+
+    def setUp(self):
+        if not self.has_listener:
+            raise self.skipTest(
                 "'org.apache.spark.sql.TestQueryExecutionListener' is not "
                 "available. Will skip the related tests.")
 
-        # Note that 'spark.sql.queryExecutionListeners' is a static immutable configuration.
-        cls.spark = SparkSession.builder \
-            .master("local[4]") \
-            .appName(cls.__name__) \
-            .config(
-                "spark.sql.queryExecutionListeners",
-                "org.apache.spark.sql.TestQueryExecutionListener") \
-            .getOrCreate()
-
     @classmethod
     def tearDownClass(cls):
-        cls.spark.stop()
+        if hasattr(cls, "spark"):
+            cls.spark.stop()
 
     def tearDown(self):
         self.spark._jvm.OnSuccessCall.clear()
@@ -3196,18 +3201,22 @@ class HiveContextSQLTests(ReusedPySparkTestCase):
     def setUpClass(cls):
         ReusedPySparkTestCase.setUpClass()
         cls.tempdir = tempfile.NamedTemporaryFile(delete=False)
+        cls.hive_available = True
         try:
             cls.sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
         except py4j.protocol.Py4JError:
-            cls.tearDownClass()
-            raise unittest.SkipTest("Hive is not available")
+            cls.hive_available = False
         except TypeError:
-            cls.tearDownClass()
-            raise unittest.SkipTest("Hive is not available")
+            cls.hive_available = False
         os.unlink(cls.tempdir.name)
-        cls.spark = HiveContext._createForTesting(cls.sc)
-        cls.testData = [Row(key=i, value=str(i)) for i in range(100)]
-        cls.df = cls.sc.parallelize(cls.testData).toDF()
+        if cls.hive_available:
+            cls.spark = HiveContext._createForTesting(cls.sc)
+            cls.testData = [Row(key=i, value=str(i)) for i in range(100)]
+            cls.df = cls.sc.parallelize(cls.testData).toDF()
+
+    def setUp(self):
+        if not self.hive_available:
+            self.skipTest("Hive is not available.")
 
     @classmethod
     def tearDownClass(cls):
@@ -5316,6 +5325,6 @@ def test_invalid_args(self):
 if __name__ == "__main__":
     from pyspark.sql.tests import *
     if xmlrunner:
-        unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'))
+        unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'), verbosity=2)
     else:
-        unittest.main()
+        unittest.main(verbosity=2)
diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py
index 103940923dd4d..d77f1baa1f344 100644
--- a/python/pyspark/streaming/tests.py
+++ b/python/pyspark/streaming/tests.py
@@ -1590,11 +1590,11 @@ def search_kinesis_asl_assembly_jar():
         sys.stderr.write("[Running %s]\n" % (testcase))
         tests = unittest.TestLoader().loadTestsFromTestCase(testcase)
         if xmlrunner:
-            result = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=3).run(tests)
+            result = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2).run(tests)
             if not result.wasSuccessful():
                 failed = True
         else:
-            result = unittest.TextTestRunner(verbosity=3).run(tests)
+            result = unittest.TextTestRunner(verbosity=2).run(tests)
             if not result.wasSuccessful():
                 failed = True
     sys.exit(failed)
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index 9111dbbed5929..8392d7f29af53 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -2353,15 +2353,7 @@ def test_statcounter_array(self):
 
 if __name__ == "__main__":
     from pyspark.tests import *
-    if not _have_scipy:
-        print("NOTE: Skipping SciPy tests as it does not seem to be installed")
-    if not _have_numpy:
-        print("NOTE: Skipping NumPy tests as it does not seem to be installed")
     if xmlrunner:
-        unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'))
+        unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'), verbosity=2)
     else:
-        unittest.main()
-    if not _have_scipy:
-        print("NOTE: SciPy tests were skipped as it does not seem to be installed")
-    if not _have_numpy:
-        print("NOTE: NumPy tests were skipped as it does not seem to be installed")
+        unittest.main(verbosity=2)
diff --git a/python/run-tests.py b/python/run-tests.py
index 6b41b5ee22814..f408fc5082b3d 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -32,6 +32,7 @@
 else:
     import queue as Queue
 from distutils.version import LooseVersion
+from multiprocessing import Manager
 
 
 # Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module
@@ -50,6 +51,7 @@ def print_red(text):
     print('\033[31m' + text + '\033[0m')
 
 
+SKIPPED_TESTS = Manager().dict()
 LOG_FILE = os.path.join(SPARK_HOME, "python/unit-tests.log")
 FAILURE_REPORTING_LOCK = Lock()
 LOGGER = logging.getLogger()
@@ -109,8 +111,34 @@ def run_individual_python_test(test_name, pyspark_python):
             # this code is invoked from a thread other than the main thread.
             os._exit(-1)
     else:
-        per_test_output.close()
-        LOGGER.info("Finished test(%s): %s (%is)", pyspark_python, test_name, duration)
+        skipped_counts = 0
+        try:
+            per_test_output.seek(0)
+            # Here expects skipped test output from unittest when verbosity level is
+            # 2 (or --verbose option is enabled).
+            decoded_lines = map(lambda line: line.decode(), iter(per_test_output))
+            skipped_tests = list(filter(
+                lambda line: re.search('test_.* \(pyspark\..*\) ... skipped ', line),
+                decoded_lines))
+            skipped_counts = len(skipped_tests)
+            if skipped_counts > 0:
+                key = (pyspark_python, test_name)
+                SKIPPED_TESTS[key] = skipped_tests
+            per_test_output.close()
+        except:
+            import traceback
+            print_red("\nGot an exception while trying to store "
+                      "skipped test output:\n%s" % traceback.format_exc())
+            # Here, we use os._exit() instead of sys.exit() in order to force Python to exit even if
+            # this code is invoked from a thread other than the main thread.
+            os._exit(-1)
+        if skipped_counts != 0:
+            LOGGER.info(
+                "Finished test(%s): %s (%is) ... %s tests were skipped", pyspark_python, test_name,
+                duration, skipped_counts)
+        else:
+            LOGGER.info(
+                "Finished test(%s): %s (%is)", pyspark_python, test_name, duration)
 
 
 def get_default_python_executables():
@@ -152,65 +180,17 @@ def parse_opts():
     return opts
 
 
-def _check_dependencies(python_exec, modules_to_test):
-    if "COVERAGE_PROCESS_START" in os.environ:
-        # Make sure if coverage is installed.
-        try:
-            subprocess_check_output(
-                [python_exec, "-c", "import coverage"],
-                stderr=open(os.devnull, 'w'))
-        except:
-            print_red("Coverage is not installed in Python executable '%s' "
-                      "but 'COVERAGE_PROCESS_START' environment variable is set, "
-                      "exiting." % python_exec)
-            sys.exit(-1)
-
-    # If we should test 'pyspark-sql', it checks if PyArrow and Pandas are installed and
-    # explicitly prints out. See SPARK-23300.
-    if pyspark_sql in modules_to_test:
-        # TODO(HyukjinKwon): Relocate and deduplicate these version specifications.
-        minimum_pyarrow_version = '0.8.0'
-        minimum_pandas_version = '0.19.2'
-
-        try:
-            pyarrow_version = subprocess_check_output(
-                [python_exec, "-c", "import pyarrow; print(pyarrow.__version__)"],
-                universal_newlines=True,
-                stderr=open(os.devnull, 'w')).strip()
-            if LooseVersion(pyarrow_version) >= LooseVersion(minimum_pyarrow_version):
-                LOGGER.info("Will test PyArrow related features against Python executable "
-                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
-            else:
-                LOGGER.warning(
-                    "Will skip PyArrow related features against Python executable "
-                    "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
-                    "%s was found." % (
-                        python_exec, pyspark_sql.name, minimum_pyarrow_version, pyarrow_version))
-        except:
-            LOGGER.warning(
-                "Will skip PyArrow related features against Python executable "
-                "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
-                "was not found." % (python_exec, pyspark_sql.name, minimum_pyarrow_version))
-
-        try:
-            pandas_version = subprocess_check_output(
-                [python_exec, "-c", "import pandas; print(pandas.__version__)"],
-                universal_newlines=True,
-                stderr=open(os.devnull, 'w')).strip()
-            if LooseVersion(pandas_version) >= LooseVersion(minimum_pandas_version):
-                LOGGER.info("Will test Pandas related features against Python executable "
-                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
-            else:
-                LOGGER.warning(
-                    "Will skip Pandas related features against Python executable "
-                    "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
-                    "%s was found." % (
-                        python_exec, pyspark_sql.name, minimum_pandas_version, pandas_version))
-        except:
-            LOGGER.warning(
-                "Will skip Pandas related features against Python executable "
-                "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
-                "was not found." % (python_exec, pyspark_sql.name, minimum_pandas_version))
+def _check_coverage(python_exec):
+    # Make sure if coverage is installed.
+    try:
+        subprocess_check_output(
+            [python_exec, "-c", "import coverage"],
+            stderr=open(os.devnull, 'w'))
+    except:
+        print_red("Coverage is not installed in Python executable '%s' "
+                  "but 'COVERAGE_PROCESS_START' environment variable is set, "
+                  "exiting." % python_exec)
+        sys.exit(-1)
 
 
 def main():
@@ -237,9 +217,10 @@ def main():
     task_queue = Queue.PriorityQueue()
 
     for python_exec in python_execs:
-        # Check if the python executable has proper dependencies installed to run tests
-        # for given modules properly.
-        _check_dependencies(python_exec, modules_to_test)
+        # Check if the python executable has coverage installed when 'COVERAGE_PROCESS_START'
+        # environmental variable is set.
+        if "COVERAGE_PROCESS_START" in os.environ:
+            _check_coverage(python_exec)
 
         python_implementation = subprocess_check_output(
             [python_exec, "-c", "import platform; print(platform.python_implementation())"],
@@ -281,6 +262,12 @@ def process_queue(task_queue):
     total_duration = time.time() - start_time
     LOGGER.info("Tests passed in %i seconds", total_duration)
 
+    for key, lines in sorted(SKIPPED_TESTS.items()):
+        pyspark_python, test_name = key
+        LOGGER.info("\nSkipped tests in %s with %s:" % (test_name, pyspark_python))
+        for line in lines:
+            LOGGER.info("    %s" % line.rstrip())
+
 
 if __name__ == "__main__":
     main()
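
For reference, here is a minimal standalone sketch (outside the patch proper) of the skipped-test detection that `run_individual_python_test` performs above. The sample log lines are hypothetical, but the regular expression is the one this patch adds to `python/run-tests.py` (written here as a raw string), and it assumes unittest output produced at verbosity level 2:

```python
import re

# Hypothetical lines from a per-test unittest log written at verbosity 2.
sample_output = [
    "test_createDataFrame_column_name_encoding (pyspark.sql.tests.ArrowTests) ... skipped "
    "'PyArrow >= 0.8.0 must be installed; however, it was not found.'\n",
    "test_some_passing_case (pyspark.sql.tests.SQLTests) ... ok\n",
]

# Same pattern that run-tests.py now greps for in each per-test output file.
skipped_tests = [
    line for line in sample_output
    if re.search(r'test_.* \(pyspark\..*\) ... skipped ', line)
]

# Mirrors the summary printed at the end of the run.
for line in skipped_tests:
    print("    %s" % line.rstrip())
```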