diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 3f8a3a7595458..153776d8b4171 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -305,22 +305,22 @@ def table(self, tableName):
 
     @since(1.4)
     def parquet(self, *paths, **options):
-        """Loads Parquet files, returning the result as a :class:`DataFrame`.
+        """
+        Loads Parquet files, returning the result as a :class:`DataFrame`.
 
+        :param mergeSchema: sets whether we should merge schemas collected from all
+            Parquet part-files. This will override ``spark.sql.parquet.mergeSchema``.
+            The default value is specified in ``spark.sql.parquet.mergeSchema``.
         :param recursiveFileLookup: recursively scan a directory for files. Using this option
                                     disables `partition discovery`_.
 
-        You can set the following Parquet-specific option(s) for reading Parquet files:
-            * ``mergeSchema``: sets whether we should merge schemas collected from all \
-                Parquet part-files. This will override ``spark.sql.parquet.mergeSchema``. \
-                The default value is specified in ``spark.sql.parquet.mergeSchema``.
-
         >>> df = spark.read.parquet('python/test_support/sql/parquet_partitioned')
         >>> df.dtypes
        [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
         """
+        mergeSchema = options.get('mergeSchema', None)
         recursiveFileLookup = options.get('recursiveFileLookup', None)
-        self._set_opts(recursiveFileLookup=recursiveFileLookup)
+        self._set_opts(mergeSchema=mergeSchema, recursiveFileLookup=recursiveFileLookup)
         return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths)))
 
     @ignore_unicode_prefix
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index 93b4c78953860..6359c31ba5655 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -535,26 +535,25 @@ def orc(self, path, recursiveFileLookup=None):
             raise TypeError("path can be only a single string")
 
     @since(2.0)
-    def parquet(self, path, recursiveFileLookup=None):
-        """Loads a Parquet file stream, returning the result as a :class:`DataFrame`.
+    def parquet(self, path, mergeSchema=None, recursiveFileLookup=None):
+        """
+        Loads a Parquet file stream, returning the result as a :class:`DataFrame`.
+
+        .. note:: Evolving.
 
+        :param mergeSchema: sets whether we should merge schemas collected from all
+            Parquet part-files. This will override ``spark.sql.parquet.mergeSchema``.
+            The default value is specified in ``spark.sql.parquet.mergeSchema``.
         :param recursiveFileLookup: recursively scan a directory for files. Using this option
                                     disables `partition discovery`_.
 
-        You can set the following Parquet-specific option(s) for reading Parquet files:
-            * ``mergeSchema``: sets whether we should merge schemas collected from all \
-                Parquet part-files. This will override ``spark.sql.parquet.mergeSchema``. \
-                The default value is specified in ``spark.sql.parquet.mergeSchema``.
-
-        .. note:: Evolving.
-
         >>> parquet_sdf = spark.readStream.schema(sdf_schema).parquet(tempfile.mkdtemp())
         >>> parquet_sdf.isStreaming
         True
         >>> parquet_sdf.schema == sdf_schema
         True
         """
-        self._set_opts(recursiveFileLookup=recursiveFileLookup)
+        self._set_opts(mergeSchema=mergeSchema, recursiveFileLookup=recursiveFileLookup)
         if isinstance(path, basestring):
             return self._df(self._jreader.parquet(path))
         else:
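
A minimal sketch of how the ``mergeSchema`` option surfaced above might be exercised, assuming a running SparkSession bound to ``spark`` and a hypothetical directory ``/tmp/parquet_data`` containing part-files written with differing schemas:

    # Batch reader: mergeSchema is picked up from **options and overrides
    # spark.sql.parquet.mergeSchema for this read only.
    df = spark.read.parquet('/tmp/parquet_data', mergeSchema=True)

    # Streaming reader: mergeSchema is now an explicit keyword parameter;
    # a schema must still be supplied before starting a file stream.
    streaming_df = spark.readStream.schema(df.schema).parquet(
        '/tmp/parquet_data', mergeSchema=True)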