[SPARK-20848][SQL] Shutdown the pool after reading parquet files #18073

Status: Closed · 3 commits
@@ -479,7 +479,8 @@ object ParquetFileFormat extends Logging {
       partFiles: Seq[FileStatus],
       ignoreCorruptFiles: Boolean): Seq[Footer] = {
     val parFiles = partFiles.par
-    parFiles.tasksupport = new ForkJoinTaskSupport(new ForkJoinPool(8))
+    val pool = new ForkJoinPool(8)

Contributor:

Would it be better to share one global thread pool? Creating a lot of thread pools may not increase concurrency.

@viirya (Member, Author), May 24, 2017:

The main concern is that if we share one thread pool for parquet reading, we may limit concurrency, as @srowen pointed out in the JIRA.

If multiple parquet reads run in parallel, they would all share a single pool; currently each read owns its own pool.

@viirya (Member, Author):

I'm not sure whether using a shared pool would change the current behavior.

Contributor:

OK, let's keep the previous behavior.
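
For illustration, a minimal sketch of the shared-pool alternative discussed above. The `SharedFooterPool` holder object is hypothetical, not code from this PR; it only shows the trade-off being debated:

    import java.util.concurrent.ForkJoinPool

    import scala.collection.parallel.ForkJoinTaskSupport

    // Hypothetical holder, not part of this PR: one process-wide pool shared
    // by every footer read. This bounds the total thread count, but also caps
    // concurrency across simultaneous reads, which is the concern above.
    // (On Scala 2.11, ForkJoinTaskSupport expects
    // scala.concurrent.forkjoin.ForkJoinPool instead.)
    object SharedFooterPool {
      lazy val pool = new ForkJoinPool(8)
      def taskSupport = new ForkJoinTaskSupport(pool)
    }

    // A call site would then use the shared pool and never shut it down:
    //   parFiles.tasksupport = SharedFooterPool.taskSupport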

+    parFiles.tasksupport = new ForkJoinTaskSupport(pool)
     parFiles.flatMap { currentFile =>
       try {
         // Skips row group information since we only need the schema.
@@ -495,6 +496,8 @@
         } else {
           throw new IOException(s"Could not read footer for file: $currentFile", e)
         }
+      } finally {
+        pool.shutdown()

Member:

Why do we terminate the pool inside flatMap?

Member:

Why not do it outside? For example:

    val parFiles = partFiles.par
    val pool = new ForkJoinPool(8)
    parFiles.tasksupport = new ForkJoinTaskSupport(pool)
    try {
      parFiles.flatMap { currentFile =>
        ...
      }.seq
    } finally {
      pool.shutdown()
    }

Member:

I would expect this to fail some test, but it didn't...

When you fix this, could you call ThreadUtils.newForkJoinPool instead, to set a better thread name?
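
For context, a sketch of what that suggestion might look like, assuming Spark's `org.apache.spark.util.ThreadUtils.newForkJoinPool(prefix, maxThreadNumber)` helper; the prefix string below is illustrative, not taken from this PR:

    import org.apache.spark.util.ThreadUtils

    // ThreadUtils.newForkJoinPool gives worker threads a recognizable name
    // prefix, which makes thread dumps easier to read.
    val pool = ThreadUtils.newForkJoinPool("readingParquetFooters", 8)
    parFiles.tasksupport = new ForkJoinTaskSupport(pool)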

@zsxwing (Member), May 24, 2017:

> Why not do it outside? For example:

Just realized the toSeq is lazy. But shutting down in flatMap is also not correct. NVM, I was wrong.

@viirya (Member, Author):

@zsxwing @gatorsmile

I was shutting the pool down outside at the beginning of this PR; I changed to the current approach after @srowen's suggestion.

I initially thought it could be wrong, but it seems fine: the tasks all appear to be invoked up front, and no more tasks are submitted later, so shutting down inside should be OK.

I can submit a follow-up if you still think we need to change it. Thank you.

@zsxwing (Member), May 24, 2017:

I didn't check the details, but I guess the implementation submits tasks one by one. Then it's possible that when the first task shuts down the pool, some tasks have not yet been submitted.

@viirya (Member, Author):

OK, we should take the safer approach. Let me submit a follow-up for this. Thanks @zsxwing.
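
To make the race concrete, a standalone sketch of the JVM behavior being discussed (assumed illustration, not Spark code): once any task calls shutdown() on the pool, submissions that have not happened yet are rejected.

    import java.util.concurrent.{ForkJoinPool, RejectedExecutionException}

    val pool = new ForkJoinPool(2)
    // A task that shuts the pool down "from inside", like the
    // pool.shutdown() in the flatMap body above.
    pool.execute(new Runnable { def run(): Unit = pool.shutdown() })
    Thread.sleep(100) // let the first task run

    // A submission racing with the shutdown may now be rejected.
    try {
      pool.execute(new Runnable { def run(): Unit = println("late task") })
    } catch {
      case e: RejectedExecutionException => println(s"rejected: $e")
    }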

+      }
     }.seq
   }

@@ -26,6 +26,22 @@ import org.apache.spark.sql.test.SharedSQLContext

 class ParquetFileFormatSuite extends QueryTest with ParquetTest with SharedSQLContext {

+  test("Number of threads doesn't grow extremely after parquet file reading") {
+    withTempDir { dir =>
+      val file = dir.toString + "/file"
+      spark.range(1).toDF("a").coalesce(1).write.parquet(file)
+      spark.read.parquet(file)
+      val numThreadBefore = Thread.activeCount
+      (1 to 100).map { _ =>
+        spark.read.parquet(file)
+      }
+      val numThreadAfter = Thread.activeCount
+      // Hard to test a correct thread number,
+      // but it shouldn't increase more than a reasonable number.
+      assert(numThreadAfter - numThreadBefore < 20)

Contributor:

After waiting long enough, can we expect this to be 0?

@viirya (Member, Author), May 24, 2017:

It drops to a few threads (about 3) after waiting long enough. The number returned by Thread.activeCount is only an estimate, so we can't expect it to be 0.
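
As an aside on the estimate issue: instead of Thread.activeCount, one could count only ForkJoinPool workers by thread name. A sketch, assuming the pools keep the JDK's default "ForkJoinPool-N-worker-M" naming:

    import scala.collection.JavaConverters._

    // Counts live threads whose names mark them as ForkJoinPool workers.
    // Default ForkJoinPool threads are named like "ForkJoinPool-1-worker-3".
    def forkJoinWorkerCount(): Int =
      Thread.getAllStackTraces.keySet.asScala
        .count(_.getName.startsWith("ForkJoinPool"))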

Contributor:

This looks hacky; can we think of a better way to test it? If not, I suggest removing this test, since the fix is straightforward and we can verify it manually with profiling tools.

@viirya (Member, Author):

OK. Let's remove the test.

+    }
+  }
+
   test("read parquet footers in parallel") {
     def testReadFooters(ignoreCorruptFiles: Boolean): Unit = {
       withTempDir { dir =>