apache · viirya · May 23, 2017 · May 24, 2017 · May 24, 2017 · srowen
diff --git a/...src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/...src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
@@ -479,8 +479,9 @@ object ParquetFileFormat extends Logging {
       partFiles: Seq[FileStatus],
       ignoreCorruptFiles: Boolean): Seq[Footer] = {
     val parFiles = partFiles.par
-    parFiles.tasksupport = new ForkJoinTaskSupport(new ForkJoinPool(8))
-    parFiles.flatMap { currentFile =>
+    val readParquetTaskSupport = new ForkJoinTaskSupport(new ForkJoinPool(8))
+    parFiles.tasksupport = readParquetTaskSupport
+    val footers = parFiles.flatMap { currentFile =>
       try {
         // Skips row group information since we only need the schema.
         // ParquetFileReader.readFooter throws RuntimeException, instead of IOException,
@@ -497,6 +498,8 @@ object ParquetFileFormat extends Logging {
         }
       }
     }.seq
+    readParquetTaskSupport.forkJoinPool.shutdown()
+    footers
   }
 
   /**

diff --git a/...est/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormatSuite.scala b/...est/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormatSuite.scala
@@ -26,6 +26,22 @@ import org.apache.spark.sql.test.SharedSQLContext
 
 class ParquetFileFormatSuite extends QueryTest with ParquetTest with SharedSQLContext {
 
+  test("Number of threads doesn't grow extremely after parquet file reading") {
+    withTempDir { dir =>
+      val file = dir.toString + "/file"
+      spark.range(1).toDF("a").coalesce(1).write.parquet(file)
+      spark.read.parquet(file)
+      val numThreadBefore = Thread.activeCount
+      (1 to 100).foreach { _ =>
+        spark.read.parquet(file)
+      }
+      val numThreadAfter = Thread.activeCount
+      // Thread.activeCount is only an estimate, so an exact count cannot be asserted;
+      // instead, bound how much the thread count may grow across the 100 repeated reads.
+      assert(numThreadAfter - numThreadBefore < 20)
+    }
+  }
+
   test("read parquet footers in parallel") {
     def testReadFooters(ignoreCorruptFiles: Boolean): Unit = {
       withTempDir { dir =>