In [10]:
import os

from pyspark.sql import SparkSession

# Master URL for the session. Set the MASTER environment variable when running
# the examples to submit them to a cluster. It can be a mesos:// or spark://
# URL, "yarn" to run on YARN, "local" to run locally with one thread, or
# "local[N]" to run locally with N threads. When MASTER is unset or empty the
# session falls back to "local[*]".
master_url = os.environ.get("MASTER") or "local[*]"

spark = (
    SparkSession.builder
    .appName("test spark datasource")
    .master(master_url)
    .getOrCreate()
)

## Generic load / save
# Read the users file and write back only two of its columns.
df = spark.read.load("resources/users.parquet")
(
    df.select("name", "favorite_color")
    .write
    .mode("overwrite")
    .save("namesAndFavColors.parquet")
)
df.show()

## Manually Specifying Options
# (json, parquet, jdbc, orc, libsvm, csv, text)
# Read JSON and write it back out as parquet by naming the format explicitly.
df = spark.read.load("resources/people.json", format="json")
(
    df.write
    .mode("overwrite")
    .save("namesAndAges.parquet", format="parquet")
)

# csv
# Extra keyword arguments are passed to the reader as data source options.
peopleDF = spark.read.load(
    "resources/people.csv",
    format="csv",
    sep=";",
    inferSchema="true",
    header="true",
)
peopleDF.show()

# orc
# The extra options are also used during write operation. For example, you can
# control bloom filters and dictionary encodings for ORC data sources. The
# following ORC example will create bloom filter and use dictionary encoding
# only for favorite_color. For Parquet, there exists
# parquet.bloom.filter.enabled and parquet.enable.dictionary, too. To find more
# detailed information about the extra ORC/Parquet options, visit the official
# Apache ORC / Parquet websites.
df = spark.read.orc("resources/users.orc")

(
    df.write
    .format("orc")
    .option("orc.bloom.filter.columns", "favorite_color")
    .option("orc.dictionary.key.threshold", "1.0")
    .option("orc.column.encoding.direct", "name")
    .mode("overwrite")
    .save("users_with_options.orc")
)



+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [9]:
## Run SQL on files directly

# Instead of loading the file through the read API first, the SQL statement
# below names the parquet file itself as the table to select from.
df = spark.sql(
    "select * from parquet.`resources/users.parquet`"
)
df.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



In [None]:
## Save Modes

# Scala/Java	Any Language	Meaning
# SaveMode.ErrorIfExists (default)	"error" or "errorifexists" (default)	When saving a DataFrame to a data source, if data already exists, an exception is expected to be thrown.
# SaveMode.Append	"append"	When saving a DataFrame to a data source, if data/table already exists, contents of the DataFrame are expected to be appended to existing data.
# SaveMode.Overwrite	"overwrite"	Overwrite mode means that when saving a DataFrame to a data source, if data/table already exists, existing data is expected to be overwritten by the contents of the DataFrame.
# SaveMode.Ignore	"ignore"	Ignore mode means that when saving a DataFrame to a data source, if data already exists, the save operation is expected not to save the contents of the DataFrame and not to change the existing data. This is similar to a CREATE TABLE IF NOT EXISTS in SQL.


In [None]:
## Saving to Persistent Tables

# DataFrames can also be saved as persistent tables into the Hive metastore using the saveAsTable command. Notice that an existing Hive deployment is not necessary to use this feature. Spark will create a default local Hive metastore (using Derby) for you.
# For file-based data sources, e.g. text, parquet, json, etc., you can specify a custom table path via the path option, e.g. df.write.option("path", "/some/path").saveAsTable("t"). When the table is dropped, the custom table path will not be removed and the table data is still there. If no custom table path is specified, Spark will write data to a default table path under the warehouse directory. When the table is dropped, the default table path will be removed too.

# Starting from Spark 2.1, persistent datasource tables have per-partition metadata stored in the Hive metastore. This brings several benefits:
#
# - Since the metastore can return only necessary partitions for a query, discovering all the partitions on the first query to the table is no longer needed.
# - Hive DDLs such as ALTER TABLE PARTITION ... SET LOCATION are now available for tables created with the Datasource API.
#
# Note that partition information is not gathered by default when creating external datasource tables (those with a path option). To sync the partition information in the metastore, you can invoke MSCK REPAIR TABLE.

In [12]:
## Bucketing, Sorting and Partitioning

# For file-based data sources, it is also possible to bucket and sort or
# partition the output. Bucketing and sorting are applicable only to
# persistent tables:
# SQL = CREATE TABLE users_bucketed_by_name(
# name STRING,
# favorite_color STRING,
# favorite_numbers array<integer>
# ) USING parquet
# CLUSTERED BY(name) INTO 42 BUCKETS;
# NOTE: peopleDF is the DataFrame read from resources/people.csv in the first
# code cell of this notebook; that cell must have been run.
(
    peopleDF.write
    .mode("overwrite")
    .bucketBy(24, "name")
    .sortBy("age")
    .saveAsTable("people_bucketed")
)

# Read the users data here, before its first use in this cell, so the writes
# below do not depend on whichever `df` an earlier cell happened to leave
# behind (several cells rebind `df` to different sources).
df = spark.read.parquet("resources/users.parquet")

# while partitioning can be used with both save and saveAsTable when using the
# Dataset APIs
# SQL = CREATE TABLE users_by_favorite_color(
# name STRING,
# favorite_color STRING,
# favorite_numbers array<integer>
# ) USING csv PARTITIONED BY(favorite_color);
(
    df.write
    .mode("overwrite")
    .partitionBy("favorite_color")
    .format("parquet")
    .save("namesPartByColor.parquet")
)

# It is possible to use both partitioning and bucketing for a single table:
(
    df.write
    .mode("overwrite")
    .partitionBy("favorite_color")
    .bucketBy(42, "name")
    .saveAsTable("users_partitioned_bucketed")
)
# partitionBy creates a directory structure as described in the Partition
# Discovery section. Thus, it has limited applicability to columns with high
# cardinality. In contrast bucketBy distributes data across a fixed number of
# buckets and can be used when the number of unique values is unbounded.
